# -*- encoding:utf-8 -*-
# Douban short-comment scraper for "Creation of the Gods I: Kingdom of Storms"
from lxml import etree
import csv
import requests
from tqdm import tqdm #(optional - only used to display a progress bar; safe to drop)
print('信息爬取中:\n')
class HouseParse(object):
    """Scrape Douban short comments for movie subject 10604086.

    Fetches paginated comment pages, extracts each comment's vote count,
    rating, watch date, location and text via XPath, accumulates the rows
    in ``self.data_list`` and writes them to ``fs.csv``.
    """

    def __init__(self):
        # Request headers. Douban only serves pages beyond the first few to
        # logged-in users, so a session cookie (captured via F12/devtools)
        # must be pasted into 'cookie'.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.55 Safari/537.36 Edg/96.0.1054.43',
            'cookie': ''
        }
        # Parsed comment rows; one dict per comment.
        self.data_list = []

    @staticmethod
    def _first(nodes, default=''):
        """Return the first XPath match, or *default* when nothing matched.

        Guards against comments that lack a field (e.g. no location span),
        which previously raised IndexError on the bare ``[0]`` index.
        """
        return nodes[0] if nodes else default

    def Sponsor(self, pages=30):
        """Crawl *pages* result pages (20 comments each) into self.data_list.

        :param pages: number of pages to fetch (default 30, as before).
        """
        for page in tqdm(range(pages)):
            # BUG FIX: the 'start' offset must advance by the page size (20).
            # The original used start=page, so successive requests overlapped
            # by 19 of their 20 comments and only ~50 distinct comments were
            # ever collected.
            start = page * 20
            url = f'https://movie.douban.com/subject/10604086/comments?start={start}&limit=20&status=P&sort=new_score'
            response = requests.get(url=url, headers=self.headers)
            # Douban pages carry a BOM, hence utf-8-sig.
            html = etree.HTML(response.content.decode('utf-8-sig'))
            # Each comment lives in its own <div class="comment">.
            for element in html.xpath('//div[@class="comment"]'):
                dict_ = {}
                dict_['有用'] = self._first(element.xpath('./h3/span[1]/span[@class="votes vote-count"]/text()'))
                dict_['推荐指数'] = self._first(element.xpath('./h3/span[2]/span[2]/@title'))
                dict_['观影时间'] = self._first(element.xpath('./h3/span[2]/span[3]/@title'))
                dict_['观影地址'] = self._first(element.xpath('./h3/span[2]/span[4]/text()'))
                dict_['评论'] = self._first(element.xpath('./p/span/text()'))
                self.data_list.append(dict_)

    def save_data(self):
        """Write the collected rows to fs.csv (utf-8 with BOM, for Excel)."""
        with open('fs.csv', 'w', encoding='utf-8-sig', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=['有用', '推荐指数', '观影时间', '观影地址', '评论'])
            writer.writeheader()
            writer.writerows(self.data_list)

    def main(self):
        """Run the full pipeline: crawl all pages, then persist to CSV."""
        self.Sponsor()
        self.save_data()
if __name__ == '__main__':
    # Entry point: crawl the comments and write them out as CSV.
    scraper = HouseParse()
    scraper.main()
    print('\n爬取成功!')
# 后记:上次用的是正则,这次才发现 XPath 写起来更容易。
# 小结:本文介绍了使用 Python 爬虫从豆瓣电影《封神第一部:朝歌风云》页面抓取短评的方法,
# 包括设置请求头(需登录 cookie)、使用 XPath 解析 HTML,以及将数据保存到 CSV 文件的过程。