import requests
import re
import xlwt
# Request headers sent with every page fetch: a desktop-browser User-Agent plus
# a session Cookie so Douban serves the full Top 250 pages to the scraper.
# NOTE(review): the Cookie is a hard-coded personal session value and will
# expire — refresh or remove it before reusing this script.
headers={
'user-agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36 SLBrowser/9.0.3.5211 SLBChan/109',
'Cookie':'ll="118254"; bid=Ll8r0Dji-xM; _pk_id.100001.4cf6=8f10dc8b0fc8e3d4.1712311891.; __gads=ID=37ec7c913844b41f:T=1712311922:RT=1712311922:S=ALNI_MbFseYm2dKx7G7G5iW-iDpXTGgdKQ; __gpi=UID=00000de252817cd9:T=1712311922:RT=1712311922:S=ALNI_MbBP76xPjsjV0MSV52UXK3gr63Xrg; _vwo_uuid_v2=D34EFF203AC84A8BE0211A07D6608E70A|e87dfc600b0bc644813957497dc8d61c; __yadk_uid=1iSAUN50b0JkNNnsAI0Wr2F8P2z33DyU; __utmz=30149280.1731464865.3.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmz=223695111.1731464865.3.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); _vwo_uuid_v2=D34EFF203AC84A8BE0211A07D6608E70A|e87dfc600b0bc644813957497dc8d61c; ap_v=0,6.0; _pk_ses.100001.4cf6=1; __utma=30149280.242167028.1712311891.1731464865.1731589109.4; __utmb=30149280.0.10.1731589109; __utmc=30149280; __utma=223695111.1865643858.1712311891.1731464865.1731589109.4; __utmb=223695111.0.10.1731589109; __utmc=223695111'
}
# Pre-build the 10 result-page URLs of the Douban Top 250 list:
# 25 movies per page, so `start` takes the values 0, 25, ..., 225.
urls = ["https://movie.douban.com/top250?start={}&filter=".format(page * 25)
        for page in range(10)]
movies = []  # one dict per movie, filled by the scraping loop below
# print(urls)
# Crawl each results page and scrape the 25 movie entries it contains.
for url in urls:
    response = requests.get(url, headers=headers)
    contents = response.content.decode("utf-8")
    # BUG FIX: the original pattern '<a href="(.*?)">' matched EVERY anchor on
    # the page (site navigation, pagination, ...), so detail links did not line
    # up with the titles scraped below.  Anchoring the match inside
    # <div class="pic"> yields one detail link per movie, consistent with the
    # other patterns.
    links = re.findall('<div class="pic">.*?<a href="(.*?)">', contents, re.S)
    titles = re.findall('<div class="hd">.*?<span class="title">(.*?)</span>', contents, re.S)
    images = re.findall('<div class="pic">.*?<img.*?src="(.*?)">', contents, re.S)
    ratings = re.findall('<div class="star">.*?<span class="rating_num" property="v:average">(.*?)</span>', contents, re.S)
    people = re.findall('<div class="star">.*?<span>(.*?)</span>', contents, re.S)
    summaries = re.findall('<div class="star">.*?<span class="inq">(.*?)</span>', contents, re.S)
    informations = re.findall('<p class="">(.*?)</p>', contents, re.S)
    # Strip non-breaking spaces, <br> tags, newlines and plain spaces from the
    # "director / year / genre" paragraph.
    new_informations = [re.sub(r'\xa0|<br>|\n| ', '', info).strip()
                        for info in informations]
    # Not every movie has an "inq" one-liner, so the seven lists can differ in
    # length; truncate them all to the shortest before zipping.
    # NOTE(review): a missing summary still shifts the later summaries up by
    # one movie — parsing per <div class="item"> block would be more robust.
    min_length = min(len(links), len(titles), len(images), len(ratings),
                     len(people), len(summaries), len(new_informations))
    for link, title, image, rating, people_count, summary, information in zip(
            links[:min_length], titles[:min_length], images[:min_length],
            ratings[:min_length], people[:min_length], summaries[:min_length],
            new_informations[:min_length]):
        movies.append({
            "详情链接": link,
            "影片片名": title.strip(),
            "影片图片": image,
            "影片评分": rating,
            "评价人数": people_count,
            "影片概况": summary.strip() if summary else "暂无简介",
            "相关信息": information,
        })
# print(movies)
# 打印所有电影信息
# for movie in movies:
# print(movie)
# print("爬取成功")
# Export the collected movies to an .xls workbook: one header row with the
# dict keys as column names, then one row per movie.
workbook = xlwt.Workbook(encoding='utf-8')
sheet1 = workbook.add_sheet('豆瓣电影排行')
# Guard against an empty crawl: movies[0] would raise IndexError otherwise.
if movies:
    ziduans = list(movies[0].keys())
    print(ziduans)
    # Header row (row 0).
    for col, column_name in enumerate(ziduans):
        sheet1.write(0, col, column_name)
    # Data rows start at row 1.
    for row, movie in enumerate(movies, start=1):
        for col, key in enumerate(ziduans):
            sheet1.write(row, col, movie[key])
workbook.save(r"豆瓣电影.xls")
# 爬取豆瓣电影排行榜页面信息:根据豆瓣电影TOP250榜单,构造抓取的起始页面地址,采集每一页的电影排行信息,
# 具体包括每部电影的详情链接、影片图片、影片片名、影片评分、评价人数、影片概况、相关信息。
# 于 2024-11-14 09:24:55 首次发布