import time
import json
import csv
import random
import logging
from datetime import datetime
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import NoSuchElementException, WebDriverException, ElementClickInterceptedException, SessionNotCreatedException
from bs4 import BeautifulSoup
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import re
import traceback
# 配置日志记录
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s [%(levelname)s] %(message)s',
handlers=[
logging.FileHandler(f'scraper_log_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log', encoding='utf-8'),
logging.StreamHandler()
]
)
logger = logging.getLogger(__name__)
def read_video_urls(csv_file):
"""从 CSV 文件读取视频链接"""
logger.info(f"正在读取视频链接文件: {csv_file}")
urls = []
try:
with open(csv_file, "r", encoding="utf-8-sig") as f:
reader = csv.reader(f)
for row in reader:
if row:
urls.append(row[0].strip())
logger.info(f"成功加载 {len(urls)} 个视频链接")
except Exception as e:
logger.error(f"读取视频链接失败: {str(e)}")
return urls
def load_cookies(driver, cookie_file="cookies.json"):
"""导入 cookies"""
logger.info(f"正在加载 cookies 文件: {cookie_file}")
try:
with open(cookie_file, "r", encoding="utf-8") as f:
cookies = json.load(f)
driver.get("https://www.douyin.com/")
time.sleep(3)
for cookie in cookies:
cookie.pop("sameSite", None)
if "expiry" in cookie:
cookie["expires"] = cookie.pop("expiry")
driver.add_cookie(cookie)
driver.refresh()
time.sleep(3)
logger.info("cookies 加载成功")
except Exception as e:
logger.error(f"加载 cookies 失败: {str(e)}")
def create_driver():
"""创建优化的浏览器配置"""
logger.info("正在创建 Chrome WebDriver 实例")
opts = Options()
opts.add_argument("--headless=new")
opts.add_argument("--disable-gpu")
opts.add_argument("--no-sandbox")
opts.add_argument("--disable-dev-shm-usage")
opts.add_argument("--disable-extensions")
opts.add_argument("--disable-plugins")
opts.add_argument("--disable-web-security")
opts.add_argument("--disable-features=VizDisplayCompositor")
opts.add_argument("--memory-pressure-off")
opts.add_argument("--max_old_space_size=4096")
opts.add_argument("--window-size=1920,1080")
opts.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
retries = 3
for attempt in range(1, retries + 1):
try:
driver = webdriver.Chrome(options=opts)
driver.implicitly_wait(10)
logger.info("WebDriver 创建成功")
return driver
except SessionNotCreatedException as e:
logger.error(f"创建 WebDriver 失败 (尝试 {attempt}/{retries}): {str(e)}")
if attempt == retries:
raise
time.sleep(2)
return None
def load_all_comments(driver, url):
"""加载所有评论(一级和二级)"""
logger.info(f"开始加载评论: {url}")
start_time = time.time()
driver.get(url)
time.sleep(3)
def get_comment_container():
try:
wait = WebDriverWait(driver, 15)
container = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "div[data-e2e='comment-list']")))
return container
except:
try:
container = driver.find_element(By.CSS_SELECTOR, "div[class*='comment']")
return container
except NoSuchElementException:
try:
scrollable = driver.execute_script("""
return document.querySelector('div[style*="overflow-y: auto"], div[style*="overflow-y: scroll"]') ||
document.querySelector('div[class*="comment"]');
""")
if scrollable:
return scrollable
except:
return driver.find_element(By.TAG_NAME, 'body')
comment_selector = "div[data-e2e='comment-item']"
comments_with_replies = []
scroll_count = 0
last_comment_count = 0
no_new_comments_count = 0
logger.info("开始加载一级评论")
while True: # 持续滚动直到检测到"暂无更多评论"
comment_container = get_comment_container()
# 关闭弹窗
if scroll_count % 5 == 0:
try:
close_buttons = driver.find_elements(By.CSS_SELECTOR, "button[class*='close'], div[class*='modal-close'], div[class*='popup-close']")
for button in close_buttons:
button.click()
time.sleep(0.5)
except Exception:
pass
# 滚动评论区
try:
driver.execute_script("arguments[0].scrollBy(0, 3000);", comment_container)
except Exception:
try:
actions = ActionChains(driver)
actions.move_to_element(comment_container).send_keys(Keys.PAGE_DOWN).perform()
except Exception:
driver.execute_script("window.scrollBy(0, 3000);")
time.sleep(1.5)
current_comments = driver.find_elements(By.CSS_SELECTOR, comment_selector)
logger.info(f"迭代 {scroll_count}: 已加载 {len(current_comments)} 条一级评论")
# 检查是否到达底部
try:
no_more = driver.find_element(By.CSS_SELECTOR, "div.fanRMYie.cDj65BDb")
if "暂无更多评论" in no_more.text:
logger.info("检测到'暂无更多评论',停止加载一级评论")
break
except NoSuchElementException:
pass
# 检查是否有新评论
if len(current_comments) == last_comment_count:
no_new_comments_count += 1
if no_new_comments_count >= 5: # 增加重试次数
logger.info("连续5次没有新评论,继续尝试")
no_new_comments_count = 0 # 重置计数器继续尝试
else:
no_new_comments_count = 0
last_comment_count = len(current_comments)
# 检查是否有二级评论
for comment in current_comments:
try:
# 检查是否有展开回复按钮
reply_expand = comment.find_elements(By.CSS_SELECTOR, "button.VZWu521O")
if reply_expand:
comments_with_replies.append(comment)
except Exception:
pass
scroll_count += 1
time.sleep(random.uniform(1.0, 2.0))
# 加载二级评论
logger.info(f"找到 {len(comments_with_replies)} 条有二级评论的一级评论,开始加载二级评论")
for i, comment in enumerate(tqdm(comments_with_replies, desc="加载二级评论")):
try:
driver.execute_script("arguments[0].scrollIntoView(true);", comment)
time.sleep(1.0)
# 首次展开二级评论
reply_expand = comment.find_elements(By.CSS_SELECTOR, "button.VZWu521O")
if reply_expand:
reply_expand[0].click()
time.sleep(1.5)
while True:
try:
# 检查"展开更多"按钮
expand_more = comment.find_elements(By.CSS_SELECTOR, "button.bgz8RRCZ")
# 检查"收起"按钮
collapse = comment.find_elements(By.CSS_SELECTOR, "button.AlC_XilC.bgz8RRCZ")
# 如果同时存在展开更多和收起,说明还有更多评论
if expand_more and "展开更多" in expand_more[0].text:
expand_more[0].click()
time.sleep(1.5)
# 如果只有收起按钮,说明已加载完所有评论
elif collapse and "收起" in collapse[0].text and not expand_more:
break
else:
break
except (NoSuchElementException, ElementClickInterceptedException):
break
except Exception as e:
logger.error(f"加载二级评论时出错: {str(e)}")
break
except Exception as e:
logger.error(f"处理评论 {i + 1} 时出错: {str(e)}")
logger.info(f"评论加载完成,耗时 {time.time() - start_time:.2f} 秒")
return driver.page_source
def extract_comment_content(block):
"""提取评论内容,包括表情"""
try:
content_parts = []
seen_text = set()
if block:
# 提取文本内容
for child in block.children:
if child.name == 'span':
text = child.get_text(strip=True)
if text and text not in seen_text:
content_parts.append(text)
seen_text.add(text)
elif child.name == 'img' and child.get('alt'):
alt = child.get('alt', '')
if alt:
content_parts.append(alt)
seen_text.add(alt)
# 备用提取:直接查找所有图片的表情
if not content_parts:
for img in block.find_all('img'):
alt = img.get('alt', '')
if alt and alt not in seen_text:
content_parts.append(alt)
seen_text.add(alt)
content = ''.join(content_parts) if content_parts else "无内容"
return content
except Exception as e:
logger.error(f"提取评论内容失败: {str(e)}")
return "无内容"
def parse_comments(html, video_url):
"""解析评论"""
logger.info(f"开始解析评论: {video_url}")
start_time = time.time()
soup = BeautifulSoup(html, "html.parser")
comment_items = soup.find_all("div", {"data-e2e": "comment-item"})
results = []
primary_index = 0
processed_comments = set()
for block in tqdm(comment_items, desc="解析评论"):
try:
is_primary = not block.find_parent("div", class_=re.compile(r'replyContainer'))
if is_primary:
primary_index += 1
a_tag = block.select_one("a[href*='/user/'] span.j5WZzJdp span span span") or block.select_one("a[href*='/user/'] span")
user = a_tag.get_text(strip=True) if a_tag else "未知用户"
user = re.sub(r'[^\w\s]', '', user)
cont_div = block.select_one("div.LvAtyU_f")
content = extract_comment_content(cont_div)
like_span = block.select_one("span[data-e2e='comment-like-count']") or block.select_one("p.wiQmZrKV span")
likes = like_span.get_text(strip=True) if like_span else "0"
secondary_comments = block.select("div.cKvms_3E.replyContainer div[data-e2e='comment-item']")
secondary_count = len(secondary_comments)
comment_key = f"primary_{primary_index}_{user}_{content}"
if comment_key not in processed_comments:
primary_comment = {
"一级评论_序号": primary_index,
"一级评论_账号": user,
"一级评论_评论内容": content,
"一级评论_赞数量": likes,
"一级评论_裂开数量": "0",
"一级评论_子评论数量": secondary_count,
"二级评论_序号": "",
"二级评论_账号": "",
"二级评论_评论内容": "",
"二级评论_点赞数量": "",
"二级评论_裂开数量": ""
}
results.append(primary_comment)
processed_comments.add(comment_key)
if secondary_comments:
secondary_index = 0
for secondary in secondary_comments:
secondary_index += 1
sec_a_tag = secondary.select_one("a[href*='/user/'] span.j5WZzJdp span span span") or secondary.select_one("a[href*='/user/'] span")
sec_user = sec_a_tag.get_text(strip=True) if sec_a_tag else "未知用户"
sec_user = re.sub(r'[^\w\s]', '', sec_user)
sec_cont_div = secondary.select_one("div.LvAtyU_f")
sec_content = extract_comment_content(sec_cont_div)
sec_like_span = secondary.select_one("span[data-e2e='comment-like-count']") or secondary.select_one("p.wiQmZrKV span")
sec_likes = sec_like_span.get_text(strip=True) if sec_like_span else "0"
comment_key = f"secondary_{primary_index}_{secondary_index}_{sec_user}_{sec_content}"
if comment_key not in processed_comments:
secondary_comment = {
"一级评论_序号": primary_index,
"一级评论_账号": "",
"一级评论_评论内容": "",
"一级评论_赞数量": "",
"一级评论_裂开数量": "",
"一级评论_子评论数量": "",
"二级评论_序号": secondary_index,
"二级评论_账号": sec_user,
"二级评论_评论内容": sec_content,
"二级评论_点赞数量": sec_likes,
"二级评论_裂开数量": "0"
}
results.append(secondary_comment)
processed_comments.add(comment_key)
except Exception as e:
logger.error(f"解析第 {primary_index + 1} 条一级评论时出错: {str(e)}")
logger.info(f"解析完成,共解析 {len(results)} 条评论,耗时 {time.time() - start_time:.2f} 秒")
return results
def main():
"""主逻辑"""
logger.info("开始爬取任务")
start_time = time.time()
video_urls = read_video_urls("test.csv")
all_comments = []
for i, url in enumerate(tqdm(video_urls, desc="处理视频")):
logger.info(f"[{i + 1}/{len(video_urls)}] 正在抓取视频: {url}")
start_video_time = time.time()
driver = create_driver()
try:
load_cookies(driver, "cookies.json")
html = load_all_comments(driver, url)
comments = parse_comments(html, url)
logger.info(f"[{i + 1}/{len(video_urls)}] 抓取到 {len(comments)} 条评论,耗时 {time.time() - start_video_time:.2f} 秒")
all_comments.extend(comments)
# 保存中间结果
with open(f"intermediate_results_{i + 1}.csv", "w", encoding="utf-8-sig", newline="") as f:
writer = csv.DictWriter(f, fieldnames=[
"一级评论_序号", "一级评论_账号", "一级评论_评论内容", "一级评论_赞数量", "一级评论_裂开数量",
"一级评论_子评论数量", "二级评论_序号", "二级评论_账号", "二级评论_评论内容", "二级评论_点赞数量",
"二级评论_裂开数量"
])
writer.writeheader()
writer.writerows(comments)
logger.info(f"[{i + 1}/{len(video_urls)}] 中间结果已保存至 intermediate_results_{i + 1}.csv")
except Exception as e:
logger.error(f"[{i + 1}/{len(video_urls)}] 处理视频失败: {url} - {str(e)}")
finally:
driver.quit()
# 保存最终结果
keys = [
"一级评论_序号", "一级评论_账号", "一级评论_评论内容", "一级评论_赞数量", "一级评论_裂开数量",
"一级评论_子评论数量", "二级评论_序号", "二级评论_账号", "二级评论_评论内容", "二级评论_点赞数量",
"二级评论_裂开数量"
]
with open("comments.csv", "w", encoding="utf-8-sig", newline="") as f:
writer = csv.DictWriter(f, fieldnames=keys)
writer.writeheader()
writer.writerows(all_comments)
logger.info(f"最终结果已保存至 comments1.csv,共 {len(all_comments)} 条评论,耗时 {time.time() - start_time:.2f} 秒")
logger.info(f"爬取任务完成,耗时 {time.time() - start_time:.2f} 秒")
if __name__ == "__main__":
main()
y_conda_envs\Scrape\python.exe F:\Scrape\Douyin_scrape\二级存表情.py
2025-06-27 19:35:59,432 [INFO] 开始爬取任务
2025-06-27 19:35:59,432 [INFO] 正在读取视频链接文件: test.csv
2025-06-27 19:35:59,432 [INFO] 成功加载 1 个视频链接
处理视频: 0%| | 0/1 [00:00<?, ?it/s]2025-06-27 19:35:59,438 [INFO] [1/1] 正在抓取视频: https://v.douyin.com/6PfJULVgFxE
2025-06-27 19:35:59,438 [INFO] 正在创建 Chrome WebDriver 实例
2025-06-27 19:36:01,019 [INFO] WebDriver 创建成功
2025-06-27 19:36:01,019 [INFO] 正在加载 cookies 文件: cookies.json
2025-06-27 19:36:26,258 [INFO] cookies 加载成功
2025-06-27 19:36:26,258 [INFO] 开始加载评论: https://v.douyin.com/6PfJULVgFxE
2025-06-27 19:36:30,962 [INFO] 开始加载一级评论
2025-06-27 19:36:42,523 [INFO] 迭代 0: 已加载 6 条一级评论
2025-06-27 19:37:56,001 [INFO] 迭代 1: 已加载 6 条一级评论
2025-06-27 19:39:09,790 [INFO] 迭代 2: 已加载 6 条一级评论
2025-06-27 19:40:23,472 [INFO] 迭代 3: 已加载 6 条一级评论
为什么程序在第一次迭代后 就没有爬取更多评论了 请检查是否滚动页面 爬取所有一级评论 帮我完善代码
最新发布