import sys
import time
import json
import bs4
import pandas as pd
from selenium import webdriver
'''
https://www.jianshu.com/p/4b89c92ff9b4
https://cuiqingcai.com/2577.html
'''
print(sys.getdefaultencoding())
# NOTE: PhantomJS is unmaintained and its driver was removed in Selenium 4;
# this line needs Selenium 3.x plus the phantomjs binary placed under ./bin/.
driver = webdriver.PhantomJS(executable_path="./bin/phantomjs.exe")

def func_load_webpage(code):
    """Scrape one fund page on qieman.com and return its key metrics as a JSON string."""
    # A fund code must be a 6-digit string.
    if not code.isdigit() or len(code) != 6:
        return False
    base_url = 'https://qieman.com/funds/%s' % code
    dict_select = {
        'fundname': '',
        'scale': '',        # 最新规模 (latest scale)
        'withdrawal': '',   # 最大回撤 (max drawdown)
        'sharp': '',        # 夏普比率 (Sharpe ratio)
        'volatility': '',   # 波动率 (volatility)
        'code': code,
    }
    driver.get(base_url)
    content = driver.page_source
    soup = bs4.BeautifulSoup(content, "html.parser")
    target = ''
    for co in soup.find_all(['span', 'h1']):
        # If the previous element was one of the labels below, this element holds its value.
        if target != '':
            dict_select[target] = co.text
            if target == 'volatility':
                break
            target = ''
        if co.name == 'h1':
            dict_select['fundname'] = co.text
        if co.text == u'最新规模':
            target = 'scale'
        elif co.text == u'最大回撤':
            target = 'withdrawal'
        elif co.text == u'夏普比率':
            target = 'sharp'
        elif co.text == u'波动率':
            target = 'volatility'
    return json.dumps(dict_select, ensure_ascii=False)
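The extraction relies on the page layout: each metric appears as a label element (最新规模, 最大回撤, 夏普比率, 波动率) immediately followed by the element holding its value, so whenever a label matches, target is set and the very next span's text is stored under that key. A minimal, self-contained sketch of that label-then-value pattern (the stub HTML below is invented purely for illustration and is not the real qieman.com markup):

import bs4

# Invented stub HTML, only to show the label-then-value pattern the loop relies on.
stub = '''
<h1>某示例基金</h1>
<span>最新规模</span><span>12.34亿</span>
<span>最大回撤</span><span>-8.76%</span>
'''
result = {}
target = ''
for co in bs4.BeautifulSoup(stub, 'html.parser').find_all(['span', 'h1']):
    if target != '':
        result[target] = co.text   # the element right after a label carries the value
        target = ''
    if co.name == 'h1':
        result['fundname'] = co.text
    if co.text == u'最新规模':
        target = 'scale'
    elif co.text == u'最大回撤':
        target = 'withdrawal'
print(result)   # {'fundname': '某示例基金', 'scale': '12.34亿', 'withdrawal': '-8.76%'}

If qieman.com changes this layout, the matching logic in func_load_webpage will need to be adjusted.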

if __name__ == "__main__":
    codelist = ['000216', '000961', '001071', '001513', '001549', '001550', '001632', '006751', '003095',
                '040046', '110011', '110023', '150303', '161033', '161122', '161130', '161725', '161903',
                '163406', '163415', '519674']
    data = []
    for item in codelist:
        data.append(func_load_webpage(item))
        time.sleep(1)   # be polite: at most one request per second
    # DataFrame.append was removed in pandas 2.0; build the rows first and concat once.
    frames = [pd.DataFrame(json.loads(line), index=[0]) for line in data if line]
    df = pd.concat(frames, ignore_index=True)
    df.to_excel('Fund.xlsx', sheet_name='Data', startcol=0, index=False)
    driver.quit()   # shut down the PhantomJS process
The resulting Fund.xlsx looks like this:

Code download
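PhantomJS is no longer maintained and its driver has been removed from recent Selenium releases, so on a current environment the same scraper can be pointed at headless Chrome instead. A minimal sketch, assuming Chrome and a matching chromedriver are available (newer Selenium versions can locate the driver automatically):

from selenium import webdriver

options = webdriver.ChromeOptions()
options.add_argument('--headless')          # render pages without opening a browser window
driver = webdriver.Chrome(options=options)  # assumes chromedriver is reachable on the PATH
driver.get('https://qieman.com/funds/110011')
print(len(driver.page_source))              # rough sanity check that the rendered page came back
driver.quit()

Everything else in func_load_webpage stays the same; only the driver construction changes.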