PS: these are just my personal notes
Scraping Douban movie reviews with requests
import re
import requests
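# a browser User-Agent is used because Douban tends to reject the default requests UA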
headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'}
douLists=[]  # one dict per review, filled in by getDou()
Get the information from a single page
def getDou(url):
    # fetch the page
    res=requests.get(url,headers=headers)
    # print(res.text)
    # movie titles
    movies=re.findall('class="main review-item".*?title="(.*?)".*?</div>',res.text,re.S)
    # print(movies)
    # user IDs
    u_ids=re.findall('class="main review-item".*?class="name">(.*?)<.*?</div>',res.text,re.S)
    # print(u_ids)
    # review titles
    titles=re.findall('class="main review-item".*?class="main-bd".*?<a href.*?">(.*?)</a>',res.text,re.S)
    # print(titles)
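    # re.S (DOTALL) lets '.' match newlines, so each pattern can span several
    # lines of HTML; findall returns only the (.*?) capture groups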
    # zip pairs up the corresponding elements of the three lists,
    # e.g. [1,2] and ['a','b'] become [(1,'a'),(2,'b')], so each review can be iterated as one record
    for movie,u_id,title in zip(movies,u_ids,titles):
        # collect the data for one review
        info={
            'movie':movie,
            'u_id':u_id,
            'title':title
        }
        douLists.append(info)
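A quick sanity check of the regexes on a single page (optional; assumes the current layout of the best-reviews page):
# getDou('https://movie.douban.com/review/best/')
# print(len(douLists), douLists[:2])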
Generate several page URLs and save the data
# list comprehension: one URL per page of the "best reviews" list
urls=['https://movie.douban.com/review/best/?start={}'.format(str(i*20)) for i in range(10)]
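# this yields ?start=0, ?start=20, ..., ?start=180, i.e. 10 pages stepping by 20 reviews per page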
# print(urls)
for url in urls:
    getDou(url)
# save the data to dou.txt
for dou in douLists:
    # print(dou)
    # movie='电影名字:'+dou['movie']
    # u_id='用户ID:'+dou['u_id']
    # title='影评标题:'+dou['title']
    with open('./dou.txt','a+',encoding='utf-8') as f:
        try:
            f.write('电影名字:'+dou['movie']+'\n\n')
            f.write('用户id:'+dou['u_id']+'\n\n')
            f.write('影评标题:'+dou['title']+'\n\n')
        except UnicodeEncodeError:
            pass
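If the review list is long, a small variant is to open dou.txt once rather than once per record; a minimal sketch of the same save step (the UnicodeEncodeError guard above is left out for brevity):
with open('./dou.txt','a+',encoding='utf-8') as f:
    for dou in douLists:
        f.write('电影名字:'+dou['movie']+'\n\n')
        f.write('用户id:'+dou['u_id']+'\n\n')
        f.write('影评标题:'+dou['title']+'\n\n')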