Using multiple coroutines and a queue, crawl the data for Mtime's TV Top 100 (title, director, cast, and synopsis), and save the data to an .xlsx file with the openpyxl module.
Mtime Top 100 link: http://www.mtime.com/top/tv/top100/
from gevent import monkey
monkey.patch_all()  # patch the standard library first, so gevent can switch coroutines on blocking calls
import gevent, openpyxl, time
from gevent.queue import Queue
from selenium import webdriver

# the page is rendered by JavaScript, so a real browser is driven instead of a plain HTTP fetch
driver = webdriver.Chrome()

work = Queue()
csv_list = []  # collected rows: [title, director, cast, synopsis]

# page 1 has no suffix; pages 2-10 follow the index-N.html pattern
url_list = ['http://www.mtime.com/top/tv/top100/']
url_ = 'http://www.mtime.com/top/tv/top100/'
for item in range(2, 11):  # range(2, 10) skipped index-10.html and lost the last 10 records
    url_list.append(url_ + 'index-' + str(item) + '.html')
for item in url_list:
    work.put_nowait(item)

def doing():
    while not work.empty():
        url = work.get_nowait()
        driver.get(url)
        time.sleep(2)  # crude wait for the JS-rendered list to load
        # Selenium 3 API; Selenium 4 uses find_elements(By.CLASS_NAME, "mov_con") instead
        items = driver.find_elements_by_class_name("mov_con")
        for item in items:
            title = item.find_element_by_tag_name('h2').find_element_by_tag_name('a').text
            daoyan = item.find_elements_by_tag_name("p")[0].find_element_by_tag_name("a").text  # director
            try:
                comment = item.find_element_by_class_name("mt3").text  # synopsis
            except Exception:
                comment = ""  # some entries have no synopsis block
            try:
                yanyuan = get_yanyuan(item.find_elements_by_tag_name("p")[1].find_elements_by_tag_name('a'))  # cast
            except Exception:
                yanyuan = ""  # some entries list no cast
            csv_list.append([title, daoyan, yanyuan, comment])

def get_yanyuan(ele):
    # join the text of every actor link into one comma-separated string
    y_list = []
    for i in ele:
        y_list.append(i.text)
    return ",".join(y_list)

task_list = []
for i in range(5):
    task = gevent.spawn(doing)
    task_list.append(task)
gevent.joinall(task_list)
driver.quit()  # quit() shuts the whole session down; close() would only close the current window

wb = openpyxl.Workbook()
sheet = wb.active
sheet.title = '可恶'
sheet['A1'] = '剧名'  # title
sheet['B1'] = '导演'  # director
sheet['C1'] = '演员'  # cast
sheet['D1'] = '评论'  # synopsis
for item in csv_list:
    sheet.append(item)
wb.save('剧组.xlsx')
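
One caveat about the coroutine design: all five greenlets share the single global driver, so while one worker sleeps, another can navigate the same browser to a different page, and records can be lost or mixed up. A minimal sketch of the usual fix, giving each worker its own browser; crawl_one is a hypothetical helper that scrapes a single page with the driver it is given (essentially the body of doing() for one URL):

def worker():
    d = webdriver.Chrome()  # a private browser per greenlet, so workers cannot trample each other
    try:
        while not work.empty():
            crawl_one(d, work.get_nowait())  # hypothetical: doing()'s per-page logic
    finally:
        d.quit()  # always shut this worker's browser down, even on error

task_list = [gevent.spawn(worker) for _ in range(5)]
gevent.joinall(task_list)

The trade-off is five Chrome processes instead of one, which is arguably overkill for ten pages; but with a shared driver the five workers gain little anyway, since they all queue up behind one browser.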

Shortcoming 1: originally only 90 of the 100 records came back. The cause was the off-by-one range(2, 10), which generates pages 2-9 and skips index-10.html; range(2, 11) above fixes it.
Shortcoming 2: the reliance on try/except to paper over missing fields (see the exception-free sketch below).
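
The plural find_elements_* methods return an empty list instead of raising when nothing matches, so absent fields can be tested for directly. A sketch under the same page structure assumed above (extract_comment and extract_yanyuan are hypothetical names):

def extract_comment(item):
    # an empty list simply means this entry has no synopsis block
    blocks = item.find_elements_by_class_name("mt3")
    return blocks[0].text if blocks else ""

def extract_yanyuan(item):
    paragraphs = item.find_elements_by_tag_name("p")
    if len(paragraphs) < 2:
        return ""  # no cast paragraph for this entry
    links = paragraphs[1].find_elements_by_tag_name("a")
    return ",".join(a.text for a in links)

This reserves try/except for genuinely unexpected failures rather than for ordinarily missing fields.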
The original page is rendered by JavaScript, so the list is invisible to a plain HTTP fetch; that is why the selenium library has to be used.
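
On that last point: time.sleep(2) always waits the full two seconds, and still fails if the page happens to need longer. Selenium's explicit waits block only until the content actually appears. A sketch; the 10-second timeout is an arbitrary choice:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver.get(url)
# wait (up to 10 s) for at least one entry block, instead of sleeping a fixed time
items = WebDriverWait(driver, 10).until(
    EC.presence_of_all_elements_located((By.CLASS_NAME, "mov_con"))
)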