由于本文爬取的短评数据未经处理,结果中存在部分剧透,请谨慎阅读!
结果展示
本文主要爬取《复联4》豆瓣短评中的前500条好评与前500条差评(因豆瓣限制,每类只能爬取前500条),并分别生成词云图:
环境
开发环境:
Anaconda for Win 10
使用库:
import csv
import threading
import requests
from lxml import etree
from queue import Queue
import matplotlib.pyplot as plt
from wordcloud import WordCloud
构建思路
采取了多线程,定义了生产者和消费者。生产者负责对各网页进行处理并爬取短评,消费者负责将所有短评数据写入文件存档。最后定义词云函数对短评文件进行处理,生成词云图。
生产者
class Producer(threading.Thread):
    """Worker thread that downloads comment pages and extracts short reviews.

    Each producer repeatedly takes a page URL from ``page_queue``, fetches it,
    and puts the text of every ``<span class="short">`` node found in the page
    onto ``txt_queue``. The thread exits once ``page_queue`` has no more URLs.

    Args:
        page_queue: Queue of page URLs still to be fetched.
        txt_queue: Queue that receives one extracted review string per item.
        *args, **kwargs: Forwarded unchanged to ``threading.Thread``.
    """

    # NOTE(review): a logged-in session cookie is hard-coded here. It is kept
    # verbatim so requests are unchanged, but it should be loaded from an
    # environment variable or config file rather than committed to source.
    HEADERS = {
        'Cookie': 'bid=hYFC5UWAFiI; ll="108304"; __yadk_uid=PzdeB6YbnuA2rDPA6jZ9nncilepjJkAf; douban-fav-remind=1; _vwo_uuid_v2=D02532D1623FA1D8C30BAAE1CADB3E2A7|4ecef3154f0aa2da14541c3cc4b1e036; __utmz=223695111.1555401670.5.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; viewed="30358316"; __utmz=30149280.1555551053.12.9.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; gr_user_id=d172f4c3-ff59-4ad6-aab4-9a35af4961a5; __utmc=30149280; __utmc=223695111; ap_v=0,6.0; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1556178683%2C%22https%3A%2F%2Fwww.google.com%2F%22%5D; _pk_ses.100001.4cf6=*; __utma=30149280.1907440475.1551581607.1556174773.1556178684.15; __utmt=1; __utmt_douban=1; __utma=223695111.1695363806.1551581607.1556174773.1556178686.8; __utmb=223695111.0.10.1556178686; dbcl2="195512317:IopWWGuhcvY"; ck=qC8B; push_noty_num=0; push_doumail_num=0; __utmv=30149280.19551; _pk_id.100001.4cf6=efcfe6a0fefef1cc.1551581609.7.1556178890.1556175418.; __utmb=30149280.24.10.1556178684',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
    }

    # Seconds to wait for the server before giving up on a page, so a stalled
    # connection cannot hang the worker indefinitely.
    REQUEST_TIMEOUT = 10

    def __init__(self, page_queue, txt_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.txt_queue = txt_queue

    def run(self):
        """Drain ``page_queue``, pushing every extracted review to ``txt_queue``."""
        # Imported locally because the file's import block only brings in Queue.
        from queue import Empty

        while True:
            # get_nowait() instead of empty()+get(): with several producers,
            # another thread can take the last URL between the two calls and
            # leave this one blocked forever in get().
            try:
                url = self.page_queue.get_nowait()
            except Empty:
                break

            try:
                resp = requests.get(
                    url, headers=self.HEADERS, timeout=self.REQUEST_TIMEOUT
                )
            except requests.RequestException as exc:
                # One unreachable page should not stop this worker from
                # fetching the remaining pages.
                print('request failed for {}: {}'.format(url, exc))
                continue

            html = etree.HTML(resp.text)
            for txt in html.xpath('//span[@class="short"]/text()'):
                self.txt_queue.put(txt)
消费者
class Consumer(threading.Thread):
    """Worker thread that writes queued review strings to a shared CSV file.

    Each consumer repeatedly takes one string from ``txt_queue`` and writes it
    as a single-column row through ``writer``. When no item arrives within
    ``timeout`` seconds the thread assumes the work is finished, closes ``fp``
    and exits.

    Args:
        txt_queue: Queue of review strings to persist.
        gLock: Lock shared by all consumers that guards ``writer`` and ``fp``.
        fp: The open file object underlying ``writer``; closed on exit.
        writer: ``csv.writer`` (or compatible object with ``writerow``).
        *args, **kwargs: Forwarded unchanged to ``threading.Thread``.
        timeout: Seconds to wait for a new item before stopping (default 40).
    """

    def __init__(self, txt_queue, gLock, fp, writer, *args, timeout=40, **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.txt_queue = txt_queue
        self.lock = gLock
        self.writer = writer
        self.fp = fp
        self.num = 0
        self.timeout = timeout

    def run(self):
        """Write queued items until the queue stays empty for ``timeout`` seconds."""
        # Imported locally because the file's import block only brings in Queue.
        from queue import Empty

        try:
            while True:
                # Only the "queue stayed empty" condition ends the loop; any
                # other error propagates instead of being silently swallowed.
                try:
                    txt = self.txt_queue.get(timeout=self.timeout)
                except Empty:
                    break
                # "with" guarantees the lock is released even if writerow
                # raises, so sibling consumers cannot deadlock on it.
                with self.lock:
                    self.writer.writerow([txt])
        finally:
            # Closing an already-closed file is a no-op, so every consumer may
            # do this safely; the lock keeps it from interleaving with a write.
            with self.lock:
                self.fp.close()
消费者从队列中取数据时设置了 40 秒的超时:当队列在超时时间内一直为空时,读取操作会抛出异常,消费者通过捕获该异常来关闭文件并跳出循环。
词云绘制函数
def get_word(file, name, font_path=r'C:\WINDOWS\FONTS\DENGL.TTF'):
    """Render a word cloud from a text file, display it, and save it as an image.

    Args:
        file: Path of the UTF-8 text file whose whole content feeds the cloud.
        name: Output image path passed to ``WordCloud.to_file``.
        font_path: Font file used to draw the words. Defaults to the Windows
            DengXian Light font used originally; pass another path on systems
            where that file does not exist.
    """
    print('开始绘制')
    # Read everything up front inside "with" so the handle is closed even if
    # word-cloud generation or plotting fails later.
    with open(file, 'r', encoding='utf-8') as ft:
        text = ft.read()

    wordcloud = WordCloud(
        collocations=False,
        width=2000,
        height=1860,
        margin=2,
        font_path=font_path,
    ).generate(text)

    plt.imshow(wordcloud)
    plt.axis("off")
    # NOTE(review): with a blocking matplotlib backend the image is only saved
    # after the preview window is closed - confirm this ordering is intended.
    plt.show()
    wordcloud.to_file(name)
    print("词云已生成")
主函数
def _crawl_and_plot(base_url, csv_path, image_path, pages=30, workers=5):
    """Scrape ``pages`` comment pages into ``csv_path`` and draw its word cloud.

    Args:
        base_url: URL template with one ``{}`` placeholder for the ``start``
            offset; pages are requested at offsets 0, 20, 40, ...
        csv_path: CSV file the reviews are appended to (opened in ``'a'`` mode,
            so repeated runs accumulate rows).
        image_path: Where the generated word-cloud image is saved.
        pages: Number of page URLs to enqueue.
        workers: Number of producer threads and of consumer threads to start.
    """
    page_queue = Queue(100)
    txt_queue = Queue(1000)
    gLock = threading.Lock()
    fp = open(csv_path, 'a', newline='', encoding='utf-8')
    writer = csv.writer(fp)

    for x in range(pages):
        page_queue.put(base_url.format(x * 20))

    producers = [Producer(page_queue, txt_queue) for _ in range(workers)]
    consumers = [Consumer(txt_queue, gLock, fp, writer) for _ in range(workers)]
    for t in producers + consumers:
        t.start()

    # Wait for the threads instead of spinning on "both queues are empty":
    # the queues can be momentarily empty while a producer is still
    # downloading a page, and the CSV is not flushed until it is closed.
    for t in producers:
        t.join()
    # Consumers stop only after their queue read times out, so this join
    # includes that idle period.
    for t in consumers:
        t.join()
    # Idempotent safety net: make sure the data is on disk before reading it.
    fp.close()

    get_word(csv_path, image_path)


def main():
    """Scrape the positive short reviews and render them to ``good.png``."""
    # Douban short-review page for Avengers: Endgame, positive reviews.
    base_url = "https://movie.douban.com/subject/26100958/comments?start={}&limit=20&sort=new_score&status=P&percent_type=h"
    _crawl_and_plot(base_url, 'endgameh.csv', "good.png")


def main_chaping():
    """Scrape the negative short reviews and render them to ``bad.png``."""
    # Douban short-review page for Avengers: Endgame, negative reviews.
    base_url = "https://movie.douban.com/subject/26100958/comments?start={}&limit=20&sort=new_score&status=P&percent_type=l"
    _crawl_and_plot(base_url, 'endgamel.csv', "bad.png")
if __name__ == "__main__":
    # Run the positive-review job first, then the negative-review job.
    for job in (main, main_chaping):
        job()