import requests
from lxml import etree
import xlwt
# Listing-page URL template; `%d` receives the value of the `start` query
# parameter (main() passes multiples of 25).
url = 'https://movie.douban.com/top250?start=%d&filter='

# Headers sent with every page request.
# NOTE(review): 'User-Agent' and 'Cookie' are blank here — presumably
# placeholders to be filled with real browser values before running; confirm.
headers = {
    'User-Agent': '',
    'Cookie': '',
    'Accept': (
        'text/html,application/xhtml+xml,application/xml;q=0.9,'
        'image/avif,image/webp,image/apng,*/*;q=0.8,'
        'application/signed-exchange;v=b3;q=0.7'
    ),
}
def get_info(start, timeout=10):
    """Download one listing page and return its HTML as text.

    Args:
        start: Value substituted into the ``start`` query parameter of the
            module-level ``url`` template.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled
            connection.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    resp = requests.get(url % start, headers=headers, timeout=timeout)
    # Fail loudly on an error status rather than returning an error page
    # that would otherwise be parsed as if it were a listing page.
    resp.raise_for_status()
    return resp.content.decode('utf-8')
def parse_content(content, resultlist):
    """Parse one listing page and add a record per movie to *resultlist*.

    Args:
        content: HTML text of a listing page.
        resultlist: Collection mutated in place; receives one ``parse_info``
            dict for every movie block matched on the page.
    """
    page = etree.HTML(content)
    # Every <div> matched here is handed to parse_info as one movie entry.
    movie_blocks = page.xpath('//*[@id="content"]/div/div[1]/ol/*/div')
    resultlist.extend(parse_info(block) for block in movie_blocks)
# XPath prefixes for the three sub-containers of a movie entry. The trailing
# ``[1]`` selects the first matching <div>; when no such <div> exists the
# whole expression evaluates to an empty list instead of raising.
_PIC = './div[@class="pic"][1]'
_HD = './div[@class="info"][1]/div[@class="hd"][1]'
_BD = './div[@class="info"][1]/div[@class="bd"][1]'

# Output key -> XPath evaluated relative to the movie entry element.
_FIELD_XPATHS = {
    'link': _PIC + '/a/@href',             # movie page link
    'imgUrl': _PIC + '/a/img/@src',        # poster image link
    'title': _HD + '/a/span[1]/text()',    # title
    'alias1': _HD + '/a/span[2]/text()',   # alias 1
    'alias2': _HD + '/a/span[3]/text()',   # alias 2
    # TODO: parse cast, release date, release location and genre out of
    # this free-text details field.
    'detail': _BD + '/p[1]/text()',        # details
    'score': _BD + '/div[1]/span[2]/text()',     # rating score
    'num': _BD + '/div[1]/span[4]/text()',       # number of ratings
    'hotcomment': _BD + '/p[2]/span/text()',     # featured comment
}


def parse_info(element):
    """Extract the fields of one movie entry from its container element.

    Args:
        element: lxml element for one movie block, expected to contain
            ``div.pic`` and ``div.info`` children.

    Returns:
        A dict with the keys ``link``, ``imgUrl``, ``title``, ``alias1``,
        ``alias2``, ``detail``, ``score``, ``num`` and ``hotcomment``. Each
        value is the list returned by ``xpath`` for that field; the list is
        empty when the corresponding node is absent, so a single malformed
        entry no longer aborts the run with ``IndexError``.
    """
    return {key: element.xpath(path) for key, path in _FIELD_XPATHS.items()}
def save_2_excel(resultlist):
    """Write the scraped records to ``豆瓣top250.xls`` (relative path).

    Row 0 holds the column captions; each record in *resultlist* fills one
    subsequent row, with columns in the order of the caption mapping below.

    Args:
        resultlist: Iterable of dicts as produced by ``parse_info``; every
            dict must contain all keys of the caption mapping.
    """
    # Record key -> caption shown in the header row; also fixes column order.
    columns = {
        'link': '电影链接',
        'imgUrl': '图片链接',
        'title': '片名',
        'alias1': '别名1',
        'alias2': '别名2',
        'detail': '详情',
        'score': '评分',
        'num': '评价人数',
        'hotcomment': '热评',
    }

    workbook = xlwt.Workbook()
    sheet = workbook.add_sheet('豆瓣点评top250')

    for col, caption in enumerate(columns.values()):
        sheet.write(0, col, caption)

    # Data rows start directly below the header row.
    for row, record in enumerate(resultlist, start=1):
        for col, key in enumerate(columns):
            sheet.write(row, col, record[key])

    workbook.save('豆瓣top250.xls')
def main():
    """Fetch the ten listing pages, parse them, and export the rows to Excel."""
    records = []
    # Start offsets 0, 25, ..., 225 — ten pages in total.
    for offset in range(0, 250, 25):
        parse_content(get_info(offset), records)
    save_2_excel(records)


if __name__ == '__main__':
    main()
# Python电影排行并写入Excel
# 于 2023-05-02 09:53:15 首次发布