import requests
from bs4 import BeautifulSoup
import time
import csv
headers={"Content-Type": "text/html; charset=utf-8",
"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
"Cookie":'替换成你自己的'
}
pages = 1
for page in range(pages):
    print(page)
    # Comments are paginated 100 per page; start is the offset. Put your own topic id in the URL.
    url = f"https://www.douban.com/group/topic/replace-with-your-topic-id/?start={page * 100}"
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
        response.encoding = 'utf-8'
    except requests.exceptions.RequestException as e:
        print(f"Request failed: {e}")
        continue
    soup = BeautifulSoup(response.text, 'lxml')
    comment_elements = soup.find_all('li', {'class': 'clearfix comment-item reply-item'})
    for element in comment_elements:
        # Attached image, if the comment has one
        img_tag = element.find('div', {'class': 'cmt-img'})
        if img_tag is not None:
            photo = str(img_tag.img['data-photo-url'])
        else:
            photo = "None"
        # Commenter ID and profile link
        a_tag = element.find('a')
        name = str(a_tag.img['alt'])
        link = str(a_tag['href'])
        # Comment text
        div_tag = element.find('div', {'class': 'markdown'})
        comment = str(div_tag.text.strip())
        # Publish time
        pubtime = str(element.find('span', {'class': 'pubtime'}).text)
        data = [name, pubtime, comment, photo, link]
        with open('output.csv', 'a', encoding='utf-8-sig', newline='') as file:
            writer = csv.writer(file)
            writer.writerow(data)
    time.sleep(2)
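One note on the CSV writing: the loop above reopens output.csv once for every comment and never writes a header row. Below is a minimal refactor sketch; the helper name write_comments and the column names are my own, assuming the same five fields collected above. It opens the file once, writes the header, and dumps all rows in one call.

import csv

def write_comments(rows, path='output.csv'):
    # rows: a list of [name, pubtime, comment, photo, link] lists,
    # i.e. the data lists built in the scraping loop above.
    with open(path, 'w', encoding='utf-8-sig', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['name', 'pubtime', 'comment', 'photo', 'link'])
        writer.writerows(rows)

Inside the loop, append each data list to a rows list instead of writing it immediately, then call write_comments(rows) once after all pages have been fetched.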
Reference: "python爬取豆瓣小组帖子评论_python爬帖子评论" (Scraping Douban group topic comments with Python), CSDN blog.
Modifications to the referenced script: added the commenter ID, the commenter's profile link, the comment publish time, and whether the comment carries an image along with the image link.
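To sanity-check the result, here is a short read-back sketch, assuming output.csv was produced by the script above (no header row, column order name, pubtime, comment, photo, link):

import csv

with open('output.csv', encoding='utf-8-sig', newline='') as file:
    rows = list(csv.reader(file))

print(f"total comments: {len(rows)}")
# The fourth column holds the image URL, or the string "None" for text-only comments.
print(f"comments with an image: {sum(1 for r in rows if r[3] != 'None')}")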