1、Beautiful Soup 的安装:
pip install beautifulsoup4
pip install lxml
2、爬取代码:
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Crawl configuration for the Douban short-comment pages.
comments = []  # starts empty; presumably filled by the crawl loop below
max_pages = 20  # number of pages to crawl
target = "https://movie.douban.com/subject/35890350/comments"
# Browser-like User-Agent header used when sending requests to the Douban
# server, assembled from its three space-separated product tokens.
headers = {
    "User-Agent": " ".join((
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
        "AppleWebKit/537.36 (KHTML, like Gecko)",
        "Chrome/121.0.0.0 Safari/537.36",
    )),
}
for page in range(max_pages):
# 获取HTML页面信息
response = requests.get(url = target
+ '?start=' + str(page * 20)
, hea