For the basic deployment, see:
selenium+chromedriver部署到云主机并爬取东方财富网的股票评论(一) (deploying selenium+chromedriver to a cloud host and crawling East Money stock comments, part 1)
https://blog.youkuaiyun.com/qq_41375702/article/details/95978618
What's new in this version:
- Added multithreading; the crawl currently runs on six threads (see the first sketch below).
- Reimplemented part of the selenium code with requests, which improves runtime efficiency (see the second sketch below).
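The six-thread split simply divides the page range into contiguous chunks and hands each chunk to its own thread. A minimal sketch of the idea, where crawl_pages is a hypothetical stand-in for the real per-page crawler (per_run in the source below) and pageNum is fixed at 60 purely for illustration:

from threading import Thread

def crawl_pages(start, end):
    # hypothetical stand-in for the real per-page crawler
    for i in range(start, end):
        print('crawling page', i)

pageNum = 60  # assumed page count, for illustration only
bounds = [1] + [pageNum * k // 6 for k in range(1, 6)] + [pageNum + 1]
threads = [Thread(target=crawl_pages, args=(bounds[k], bounds[k + 1]))
           for k in range(6)]
for t in threads:
    t.start()
for t in threads:
    t.join()

Threads help here because the work is network-bound: the interpreter releases the GIL while waiting on HTTP responses, so the six workers overlap their waits.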
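The requests swap: the comment-list pages can be fetched with a plain HTTP GET, so the headless browser (and its start-up cost) is kept only where the code still uses it, namely reading the page count. A minimal before/after sketch, using the hk00700 (Tencent) list page as an example URL:

import requests
from selenium import webdriver

url = 'http://guba.eastmoney.com/list,hk00700_1.html'

# Before: start a whole headless browser just to read the page source.
option = webdriver.ChromeOptions()
option.add_argument('--headless')
driver = webdriver.Chrome(options=option)
driver.get(url)
html = driver.page_source
driver.quit()

# After: a single HTTP request, no browser start-up cost.
html = requests.get(url).text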
Run output on Windows:
Run output on Linux:
Downloaded content:
Source code:
import re
import requests
import os
import time
from selenium import webdriver
import platform
from threading import Thread
class Win():
    def __init__(self):
        pass

    # Fetch the HTML of one comment-list page.
    def get_general_page_html(self, url):
        '''
        Old selenium version, replaced by requests:
        option = webdriver.ChromeOptions()
        option.add_argument('--headless')
        driver = webdriver.Chrome(options=option)
        driver.get(url)
        '''
        html = requests.get(url).text
        return html
    # Number of comment pages, read via headless Chrome.
    def get_num(self, url):
        option = webdriver.ChromeOptions()
        option.add_argument('--headless')
        driver = webdriver.Chrome(options=option)
        driver.get(url)
        html = driver.page_source
        driver.quit()
        page = re.findall(r'sumpage.*?>(\d+)</span>', html, re.S)
        try:
            num = int(page[0])
        except (IndexError, ValueError):
            print('get_num: could not parse the page count from', url)
            num = 0
        return num
    # Extract the relative URLs of the individual comment pages from a list page.
    def get_short_url_toConstruct(self, html):
        pattern = re.compile('articleh.*?l3 a3.*?href="/(.*?)" title', re.S)
        url_list = re.findall(pattern, html)
        return url_list
    # Write the collected comments to a .txt file.
    def write_to_file(self, comment, path, filename):
        totName = os.path.join(path, filename)
        with open(totName + '.txt', 'w+', encoding='utf-8') as f:
            f.write(comment.strip())
        print(totName + '.txt: written.')
    def get_comment(self, shorturl, base_url_list):
        url_list = []
        comm_list = []
        for one in base_url_list:
            url = shorturl + one
            url_list.append(url)
        for url in url_list:
            html = requests.get(url).text
            pattern = re.compile('short_text">(.*?)<', re.S)
            results = re.findall(pattern, html)
            for one in results:
                one = one.strip()
                comm_list.append(one)
        return comm_list
    # Crawl the comments of one list page.
    def per_run(self, location, path, filename):
        url = 'http://guba.eastmoney.com/list,{location}{filename}.html'.format(location=location, filename=filename)
        html = self.get_general_page_html(url)
        print('Got the html of page:', url)
        base_url_list = self.get_short_url_toConstruct(html)
        comment = self.get_comment(shorturl='http://guba.eastmoney.com/', base_url_list=base_url_list)
        strcom = '\n'.join(comment)
        self.write_to_file(comment=strcom, path=path, filename=filename)
    # Crawl all comments for every configured stock.
    def Run(self):
        start = time.time()
        print('----------info------------')
        shareDict = {}
        '''
        hk:
            00700  Tencent
            02331  Li Ning
            01810  Xiaomi
            03690  Meituan-Dianping
        us:
            BIDU   Baidu
            baba   Alibaba
            NKE    Nike
            googl  Google
        '''
        shareDict['hk'] = ['00700']
        shareDict['us'] = ['baba', 'BIDU']
        print('About to crawl:')
        for i in shareDict.items():
            print(i)
        for location in shareDict.keys():
            startOne = time.time()
            print('stock exchange:', location)
            for name in shareDict[location]:
                if os.path.exists(name):
                    print('Directory', name, 'already exists')
                else:
                    print('Creating directory:', name)
                    os.mkdir(name)
                print('Stock:', name)
                pageNum = self.get_num(url='http://guba.eastmoney.com/list,' + str(location + name) + '.html')
                print('Total comment pages:', pageNum)
                # Split pages 1..pageNum into six contiguous chunks, one per thread.
                tnum1 = pageNum // 6
                tnum2 = pageNum * 2 // 6
                tnum3 = pageNum * 3 // 6
                tnum4 = pageNum * 4 // 6
                tnum5 = pageNum * 5 // 6
                tnum6 = pageNum + 1  # range() excludes its end, so +1 keeps the last page

                def crawl_range(start_page, end_page):
                    for i in range(start_page, end_page):
                        synx = name + '_' + str(i)
                        self.per_run(location=location, path=name, filename=synx)

                ranges = [(1, tnum1), (tnum1, tnum2), (tnum2, tnum3),
                          (tnum3, tnum4), (tnum4, tnum5), (tnum5, tnum6)]
                threads = [Thread(target=crawl_range, args=r) for r in ranges]
                for t in threads:
                    t.start()
                for t in threads:
                    t.join()
        print('Total time:', time.time() - start, 's')
class Linux(Win):
    def __init__(self, path):
        self.path = path

    # Fetch the HTML of one comment-list page.
    def get_general_page_html(self, url):
        html = requests.get(url).text
        return html

    # Number of comment pages; on the server Chrome needs --no-sandbox
    # and an explicit chromedriver path.
    def get_num(self, url):
        option = webdriver.ChromeOptions()
        option.add_argument('--no-sandbox')
        option.add_argument('--headless')
        driver = webdriver.Chrome(executable_path=self.path, options=option)
        driver.get(url)
        html = driver.page_source
        driver.quit()
        page = re.findall(r'sumpage.*?>(\d+)</span>', html, re.S)
        try:
            num = int(page[0])
        except (IndexError, ValueError):
            print('get_num: could not parse the page count from', url)
            num = 0
        return num
if __name__ == '__main__':
    platf = platform.platform()
    if 'Windows' in platf:
        crawl = Win()
        crawl.Run()
    elif 'Linux' in platf:
        path = input('Path to chromedriver (default /root/sel/chromedriver): ').strip()
        if len(path) == 0:
            path = '/root/sel/chromedriver'
        crawl = Linux(path=path)
        crawl.Run()
        # try:
        #     os.popen('mkdir all')
        #     os.popen('mv *[^all] all')
        #     os.popen('mv all/df.py .')
        #     os.popen('zip all.zip all -r')
        #     os.popen('rm all -rf')
        # except:
        #     print('os.popen error')
    else:
        print(platf)
# Timing for the same crawl: 3 threads -> 6.875 s; 6 threads -> 3.229 s