Python爬虫:多线程爬取Endgame的豆瓣短评并生成词云图

本文采用Python多线程爬取《复联4》(Endgame)豆瓣短评的前500条好评与差评,因存在剧透,请谨慎阅读。通过设置生产者和消费者,生产者负责爬取数据,消费者负责存储,最终利用词云图展示短评内容。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >

本文由于爬取了短评,未处理数据,结果中存在部分剧透!!慎读!

结果展示

主要通过爬取复联4豆瓣短评中前500条好评与前500条差评(因为豆瓣限制,只能爬前500条),并生成词云图:
好评
差评

环境

开发环境:
Anaconda for Win 10
使用库:

import csv
import threading
from queue import Empty, Queue

import matplotlib.pyplot as plt
import requests
from lxml import etree
from wordcloud import WordCloud

构建思路

采取了多线程,定义了生产者和消费者。生产者负责对各网页进行处理并爬取短评,消费者负责对所有短评数据进行存档。最后定义词云函数对短评文件进行处理,生成词云图。

生产者

class Producer(threading.Thread):
    """Worker thread that downloads comment pages and extracts short reviews.

    Each producer repeatedly takes a URL from ``page_queue``, fetches it with
    the request headers defined in :meth:`run`, and puts the text of every
    ``<span class="short">`` node onto ``txt_queue``. The thread exits as
    soon as ``page_queue`` has no URL left.

    Args:
        page_queue: Queue of page URLs that still have to be fetched.
        txt_queue: Queue that receives one extracted review string per item.
        *args: Forwarded unchanged to ``threading.Thread``.
        **kwargs: Forwarded unchanged to ``threading.Thread``.
    """

    # Seconds to wait for a response before giving up on a page; without a
    # timeout a stalled connection would block the thread indefinitely.
    request_timeout = 10

    def __init__(self, page_queue, txt_queue, *args, **kwargs):
        super(Producer, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.txt_queue = txt_queue

    def run(self):
        """Fetch queued pages until the page queue is drained."""
        # Douban request headers. NOTE(review): the Cookie looks like a
        # captured login session; it will expire and should not be shared.
        headers = {
            'Cookie': 'bid=hYFC5UWAFiI; ll="108304"; __yadk_uid=PzdeB6YbnuA2rDPA6jZ9nncilepjJkAf; douban-fav-remind=1; _vwo_uuid_v2=D02532D1623FA1D8C30BAAE1CADB3E2A7|4ecef3154f0aa2da14541c3cc4b1e036; __utmz=223695111.1555401670.5.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; viewed="30358316"; __utmz=30149280.1555551053.12.9.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; gr_user_id=d172f4c3-ff59-4ad6-aab4-9a35af4961a5; __utmc=30149280; __utmc=223695111; ap_v=0,6.0; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1556178683%2C%22https%3A%2F%2Fwww.google.com%2F%22%5D; _pk_ses.100001.4cf6=*; __utma=30149280.1907440475.1551581607.1556174773.1556178684.15; __utmt=1; __utmt_douban=1; __utma=223695111.1695363806.1551581607.1556174773.1556178686.8; __utmb=223695111.0.10.1556178686; dbcl2="195512317:IopWWGuhcvY"; ck=qC8B; push_noty_num=0; push_doumail_num=0; __utmv=30149280.19551; _pk_id.100001.4cf6=efcfe6a0fefef1cc.1551581609.7.1556178890.1556175418.; __utmb=30149280.24.10.1556178684',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
        }
        while True:
            # Take the next URL atomically. Checking empty() and then calling
            # a blocking get() lets another producer grab the last URL in
            # between, which would leave this thread waiting forever.
            try:
                url = self.page_queue.get_nowait()
            except Empty:
                break
            try:
                resp = requests.get(url, headers=headers,
                                    timeout=self.request_timeout)
            except requests.RequestException as exc:
                # One failed page should not stop this worker from
                # processing the remaining URLs.
                print('Skipping {}: {}'.format(url, exc))
                continue
            html = etree.HTML(resp.text)
            if html is None:
                # Nothing parsable came back (e.g. an empty body).
                continue
            for txt in html.xpath('//span[@class="short"]/text()'):
                self.txt_queue.put(txt)

消费者

class Consumer(threading.Thread):
    """Worker thread that writes queued review texts to a shared CSV file.

    Each consumer takes strings from ``txt_queue`` and writes every one as a
    single-column row through ``writer``, holding ``gLock`` while writing.
    When no item arrives within ``timeout`` seconds the consumer closes
    ``fp`` and exits. The file is shared, so once one consumer has closed it
    the others can no longer write.

    Args:
        txt_queue: Queue of review strings to persist.
        gLock: Lock shared by all consumers that guards ``writer`` and ``fp``.
        fp: Open file object behind ``writer``; closed when the thread ends.
        writer: ``csv.writer``-like object exposing ``writerow``.
        *args: Forwarded unchanged to ``threading.Thread``.
        timeout: Seconds to wait for a new item before the consumer treats
            the queue as finished. Defaults to 40.
        **kwargs: Forwarded unchanged to ``threading.Thread``.
    """

    def __init__(self, txt_queue, gLock, fp, writer, *args, timeout=40,
                 **kwargs):
        super(Consumer, self).__init__(*args, **kwargs)
        self.txt_queue = txt_queue
        self.lock = gLock
        self.writer = writer
        self.fp = fp
        # Not used inside this class; kept because it is a public attribute.
        self.num = 0
        self.idle_timeout = timeout

    def run(self):
        """Write queued texts until the queue stays empty for the timeout."""
        try:
            while True:
                # Only the queue timeout means "no more work". Any other
                # error is allowed to surface instead of being swallowed.
                try:
                    txt = self.txt_queue.get(timeout=self.idle_timeout)
                except Empty:
                    break
                # The context manager releases the lock even if the write
                # fails, so the other consumers cannot deadlock on it.
                with self.lock:
                    self.writer.writerow([txt])
        finally:
            # Close under the lock so the file is never closed in the middle
            # of another consumer's write.
            with self.lock:
                self.fp.close()

在消费者中,通过捕获队列长时间为空时 get 超时所抛出的异常来跳出循环。

词云绘制函数

def get_word(file, name, font_path=r'C:\WINDOWS\FONTS\DENGL.TTF'):
    """Render the text of ``file`` as a word cloud, save it and display it.

    Args:
        file: Path of the UTF-8 text/CSV file whose whole content is used.
        name: Path of the image file the word cloud is written to.
        font_path: Font file passed to ``WordCloud``. The default is the
            Windows font path this script was written against; pass another
            path on systems where that file does not exist.

    Returns:
        None. If ``file`` contains only whitespace, a message is printed and
        no image is produced.
    """
    print('开始绘制')
    # Read everything up front so the file is closed even if the rendering
    # below fails.
    with open(file, 'r', encoding='utf-8') as ft:
        text = ft.read()
    if not text.strip():
        # An empty crawl result would otherwise fail inside generate().
        print('No text found in {}; word cloud not generated'.format(file))
        return
    wordcloud = WordCloud(
        collocations=False,
        width=2000,
        height=1860,
        margin=2,
        font_path=font_path,
    ).generate(text)
    # Save before showing: plt.show() may block until the window is closed,
    # and the image should exist even if the viewer is interrupted.
    wordcloud.to_file(name)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
    print("词云已生成")

主函数

def main():
    """Crawl the positive-review pages and draw their word cloud.

    Queues 30 page URLs (20 comments per page), starts five ``Producer`` and
    five ``Consumer`` threads that append the comments to ``endgameh.csv``,
    waits for all of them to finish, and then renders ``good.png`` from that
    file.

    The consumers only stop after their queue has stayed empty for their
    idle timeout, so this function returns some time after the last comment
    has been written.
    """
    page_queue = Queue(100)
    txt_queue = Queue(1000)
    gLock = threading.Lock()
    fp = open('endgameh.csv', 'a', newline='', encoding='utf-8')
    try:
        writer = csv.writer(fp)
        # URL template of the positive short-review pages of the film.
        base_url = "https://movie.douban.com/subject/26100958/comments?start={}&limit=20&sort=new_score&status=P&percent_type=h"
        for x in range(0, 30):
            page_queue.put(base_url.format(x * 20))
        producers = [Producer(page_queue, txt_queue) for _ in range(5)]
        consumers = [Consumer(txt_queue, gLock, fp, writer) for _ in range(5)]
        for t in producers + consumers:
            t.start()
        # Wait for the workers instead of polling the queues: both queues can
        # be empty while producers are still downloading and before the CSV
        # has been flushed, which would yield a partial word cloud.
        for t in producers + consumers:
            t.join()
    finally:
        # The consumers normally close the file; this covers early failures.
        fp.close()
    get_word("endgameh.csv", "good.png")
           
def main_chaping():
    """Crawl the negative-review pages and draw their word cloud.

    Queues 30 page URLs (20 comments per page), starts five ``Producer`` and
    five ``Consumer`` threads that append the comments to ``endgamel.csv``,
    waits for all of them to finish, and then renders ``bad.png`` from that
    file.

    The consumers only stop after their queue has stayed empty for their
    idle timeout, so this function returns some time after the last comment
    has been written.
    """
    page_queue = Queue(100)
    txt_queue = Queue(1000)
    gLock = threading.Lock()
    fp = open('endgamel.csv', 'a', newline='', encoding='utf-8')
    try:
        writer = csv.writer(fp)
        # URL template of the negative short-review pages of the film.
        base_url = "https://movie.douban.com/subject/26100958/comments?start={}&limit=20&sort=new_score&status=P&percent_type=l"
        for x in range(0, 30):
            page_queue.put(base_url.format(x * 20))
        producers = [Producer(page_queue, txt_queue) for _ in range(5)]
        consumers = [Consumer(txt_queue, gLock, fp, writer) for _ in range(5)]
        for t in producers + consumers:
            t.start()
        # Wait for the workers instead of polling the queues: both queues can
        # be empty while producers are still downloading and before the CSV
        # has been flushed, which would yield a partial word cloud.
        for t in producers + consumers:
            t.join()
    finally:
        # The consumers normally close the file; this covers early failures.
        fp.close()
    get_word("endgamel.csv", "bad.png")
            
if __name__ == "__main__":
    # Run the positive-review crawl first, then the negative-review crawl.
    for entry_point in (main, main_chaping):
        entry_point()
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值