# Scrape short reviews of the Douban movie "The Eight Hundred" (八佰) and generate a word cloud image.
import requests
import lxml.html
import jieba
from wordcloud import WordCloud
def getEssayStr():
    """Log in to Douban, then scrape every page of short reviews for the movie
    (subject id 26754233) and return them concatenated into one string.

    Returns:
        str: all review texts joined together ('' if nothing was scraped).

    Note:
        Network-dependent; login failure is reported but scraping is still
        attempted (anonymous users can read a limited number of pages).
    """
    s = requests.Session()
    headers = {
        'Accept': 'application/json',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Connection': 'keep-alive',
        'Content-Length': '61',
        'Content-Type': 'application/x-www-form-urlencoded',
        'Cookie': 'bid=r07aOl63UBM; douban-fav-remind=1; ll="118282"; __utmv=30149280.20941; __gads=ID=c2fa54dd6107d6b3:T=1583723270:S=ALNI_MY5kWrVPewlS0vnGB36bwbFs_0qdw; gr_user_id=7644d0f5-4a81-44e5-941b-2d5fcbca297a; _vwo_uuid_v2=D825BAE64AADDF363C29403087A2ACDFA|622a22100a0d398aef330deaf0d3f7c7; viewed="2000732_30452948"; ap_v=0,6.0; __utma=30149280.57625699.1577763497.1597474148.1597649665.21; __utmz=30149280.1597649665.21.18.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utmc=30149280; apiKey=; _pk_ses.100001.2fad=*; _pk_ref.100001.2fad=%5B%22%22%2C%22%22%2C1597649914%2C%22https%3A%2F%2Fmovie.douban.com%2F%22%5D; vtoken=phone_reset_password%20a86a91acb7f34d09aeb37e5d6d229f1c; _pk_id.100001.2fad=1556a95e115c5dc3.1579250139.5.1597650007.1597474154.; last_login_way=account; push_noty_num=0; push_doumail_num=0; __utmt=1; __utmb=30149280.6.10.1597649665; login_start_time=1597650107010',
        'Host': 'accounts.douban.com',
        'Origin': 'https://accounts.douban.com',
        'Referer': 'https://accounts.douban.com/passport/login_popup?login_source=anony',
        'Sec-Fetch-Mode': 'cors',
        'Sec-Fetch-Site': 'same-origin',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest'
    }
    login_url = 'https://accounts.douban.com/j/mobile/login/basic'
    data = {
        'name': '豆瓣账号',
        'password': '豆瓣密码',
        'remember': 'false'
    }
    try:
        r = s.post(url=login_url, data=data, headers=headers)
        r.raise_for_status()
    except requests.RequestException:
        # Narrowed from a bare `except:` so KeyboardInterrupt / SystemExit
        # are not swallowed; only network/HTTP failures are reported here.
        print('登录失败')
    page = 0
    essay_str = ''
    while True:
        essay_url = f'https://movie.douban.com/subject/26754233/comments?start={page * 20}&limit=20&sort=new_score&status=P'
        essay_html = s.get(essay_url, headers={'User-Agent': 'Mozilla/5.0'}).content.decode()
        essay_selector = lxml.html.fromstring(essay_html)
        essays = essay_selector.xpath('//div[@class="mod-bd"]/div[@class="comment-item"]')
        for essay in essays:
            # Guard against comment items with no text node (e.g. deleted
            # comments) instead of raising IndexError on `[0]`.
            texts = essay.xpath('div[@class="comment"]/p/span/text()')
            if texts:
                essay_str += texts[0]
        next_page = essay_selector.xpath('//div[@id="paginator"]/span[@class="next"]')
        # BUG FIX: the original broke out when a "next" link EXISTED, so only
        # the first page was ever scraped. Stop only when there is no next page.
        if len(next_page) == 0:
            break
        page += 1
    return essay_str
def createWordCloud(text, font_path='C:\\Windows\\Fonts\\SIMYOU.TTF', output_path='E:\\词云.png'):
    """Segment Chinese *text* with jieba and render it as a word-cloud PNG.

    Args:
        text (str): raw review text to visualize. (Renamed from `str`,
            which shadowed the builtin.)
        font_path (str): TTF font used to draw CJK glyphs; defaults to the
            original hard-coded Windows font.
        output_path (str): where the PNG is written; defaults to the
            original hard-coded location.
    """
    # jieba has no spaces between words; WordCloud tokenizes on whitespace,
    # so join the segments with spaces first.
    cut_text = ' '.join(jieba.cut(text))
    cloud = WordCloud(font_path=font_path, background_color='black',
                      width=900, height=600, max_words=100)
    cloud.generate(cut_text).to_file(output_path)
if __name__ == '__main__':
    # Scrape the reviews, then render them as a word-cloud image.
    createWordCloud(getEssayStr())