Notes on Beating a CSS Anti-Scraping Scheme

Target site: Maoyan Movies (猫眼电影)
Main workflow
  1. Crawl the URL of every movie
  2. Fetch the HTML source of each movie page
  3. Parse the source and download the corresponding woff font file
  4. Render each digit glyph in the font with fontTools
  5. Recognize the rendered digits with a machine-learning model
  6. Replace the obfuscated characters in the source with the recognized digits (a compressed sketch of steps 4-6 follows this list)
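
A compressed, illustrative sketch of steps 4-6 (not the project code; the mapping values are made up): after rendering and recognising the glyphs we end up with a dict that maps each glyph's decimal code point to a real digit, and the obfuscated numbers are recovered by replacing the matching numeric character references in the serialized HTML.

from html import unescape

def replace_obfuscated(fragment: str, img_to_num: dict) -> str:
    # fragment is a serialized piece of HTML, e.g.
    # '<span class="stonefont">&#58882;.&#59539;</span>'
    for code_point, digit in img_to_num.items():
        fragment = fragment.replace(f"&#{code_point};", digit)
    # unescape whatever ordinary entities are left over
    return unescape(fragment)

# hypothetical mapping produced by steps 4-5
print(replace_obfuscated('<span class="stonefont">&#58882;.&#59539;</span>',
                         {"58882": "9", "59539": "6"}))
# -> <span class="stonefont">9.6</span>
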
Pitfalls encountered
  • pyquery's .text() method automatically unescapes the HTML character references, so the replacement step failed and only printed garbage; I had to switch to lxml instead (demonstrated in the sketch right after this list)
  • Many write-ups online convert the woff file to XML and identify each character by analysing the glyf outlines, but after inspecting a few fonts I found that even the same digit does not keep exactly the same x, y values across fonts (see the note after the woffToPng.py listing)
  • I am still not very familiar with how to use multithreading
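
A minimal, standalone demonstration of the first pitfall (illustration only, not part of the project): pyquery's .text() has already decoded the &#x...; references into private-use characters, so replacing the string "&#NNNNN;" afterwards finds nothing, whereas lxml's etree.tostring() keeps them as decimal character references.

from lxml import etree
from pyquery import PyQuery as pq

snippet = '<span class="stonefont">&#xe602;.&#xe893;</span>'

# pyquery: the references are already unescaped into '\ue602' and '\ue893'
print(repr(pq(snippet).text()))

# lxml: serializing the node keeps them as decimal references (&#58882; / &#59539;),
# which is exactly what the replacement code in Myspider.py relies on
node = etree.HTML(snippet).xpath("//span[@class='stonefont']")[0]
print(etree.tostring(node).decode("utf8"))
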
The project file list is as follows

(screenshot of the project files)

The full code is below.

Myspider.py
import requests
import fake_useragent
import re
import os
from woffToPng import woff_to_image
from resizeImage import resize_img
from lxml import etree
from html import unescape
from ThreadClass import SpiderThread
from SaveMovieData import SaveInfo
from pyquery import PyQuery as pq
from pprint import pprint
import json
import time


# Implementing this with coroutines proved a bit difficult, so use multithreading for now

# store the collected film URLs
film_urls = []


def verify_img(img_dir):
    api_url = "http://127.0.0.1:6000/b"
    img_to_num = {}
    for file in os.listdir(img_dir):
        file_name = os.path.join(img_dir, file)
        # normalise the image size before sending it to the recognition service
        resize_img(file_name, file_name)
        with open(file_name, "rb") as fp:
            files = {"image_file": ("image_file", fp, "application")}
            r = requests.post(url=api_url, files=files, timeout=None)
        if r.status_code == 200:
            # the file name (minus the "uni" prefix) is the hex code point of the digit glyph
            num_id = os.path.splitext(file)[0][3:]
            img_to_num[str(int(num_id, 16))] = r.json().get("value")
    return img_to_num


def find_certain_part(html, xpath_format):
    try:
        return html.xpath(xpath_format)[0]
    except Exception:
        return "null"

def parse_data_by_lxml(source_code, img_to_num, saver):
    html = etree.HTML(source_code)

    with open("somexpaths.json", "r", encoding="utf8") as f:
        xpaths = json.load(f)

    movie_name = find_certain_part(html, xpaths.get("movie_name"))
    movie_ename = find_certain_part(html, xpaths.get("movie_ename"))
    movie_classes = find_certain_part(html, xpaths.get("movie_classes")).strip()
    movie_length = find_certain_part(html, xpaths.get("movie_length")).strip()
    movie_showtime = find_certain_part(html, xpaths.get("movie_showtime")).strip()


    text_pattern = re.compile('.*?class="stonefont">(.*?)</span>')
    data_to_be_replace = []

    movie_score = find_certain_part(html, xpaths.get("movie_score"))
    movie_score_num = find_certain_part(html, xpaths.get("movie_score_num"))
    if movie_score != "null":
        movie_score = text_pattern.search(etree.tostring(movie_score).decode("utf8")).group(1)
    if movie_score_num != "null":
        movie_score_num = text_pattern.search(etree.tostring(movie_score_num).decode("utf8")).group(1)

    data_to_be_replace.append(movie_score)
    data_to_be_replace.append(movie_score_num)

    movie_box = find_certain_part(html, xpaths.get("movie_box"))
    if movie_box != "null":
        movie_box = text_pattern.search(etree.tostring(movie_box).decode("utf8")).group(1)
    movie_box_unit = find_certain_part(html, xpaths.get("movie_box_unit"))

    data_to_be_replace.append(movie_box)


    # make sure every field is a string
    for item in data_to_be_replace:
        assert isinstance(item, str)
    # replace the numeric character references with the recognised digits
    for key, value in img_to_num.items():
        new_key = f"&#{key};"
        for i in range(len(data_to_be_replace)):
            if data_to_be_replace[i] == "null":
                continue
            if new_key in data_to_be_replace[i]:
                data_to_be_replace[i] = data_to_be_replace[i].replace(new_key, value)

    movie_score, movie_score_num, movie_box = [unescape(item) for item in data_to_be_replace]
    # treat a missing score as 0
    if movie_score == "null":
        movie_score = "0"
    if movie_box != "null":
        movie_box = movie_box + movie_box_unit.strip()

    movie_brief_info = find_certain_part(html, xpaths.get("movie_brief_info"))
    assert(isinstance(movie_brief_info, str))
    # the logic here is slightly flawed: it simply assumes the first name listed is the director
    movie_director, *movie_actors = [item.strip() for item in html.xpath("//body//div[@id='app']//div//div//div//div[@class='tab-content-container']//div//div[@class='mod-content']//div//div//ul//li//div//a/text()")]
    movie_actors = ",".join(movie_actors)

    movie_comments = {}
    try:
        names = html.xpath("//body//div[@id='app']//div//div//div//div//div[@class='module']//div[@class='mod-content']//div[@class='comment-list-container']//ul//li//div//div[@class='user']//span[@class='name']/text()")
        comments = html.xpath("//body//div[@id='app']//div//div//div//div//div[@class='module']//div[@class='mod-content']//div[@class='comment-list-container']//ul//li//div[@class='main']//div[@class='comment-content']/text()")
        assert(len(names) == len(comments))
        for name, comment in zip(names, comments):
            movie_comments[name] = comment
    except Exception:
        pass

    save_id = saver.insert_dict({
        "名称": movie_name,
        "别名": movie_ename,
        "类别": movie_classes,
        "时长": movie_length,
        "上映时间": movie_showtime,
        "评分": float(movie_score),
        "评分人数": movie_score_num,
        "票房": movie_box,
        "简介": movie_brief_info,
        "导演": movie_director,
        "演员": movie_actors,
        "热门评论": movie_comments
    })
    print(f"{save_id} 保存成功")

# 爬取源码,在获得源码之后处理字体文件,处理完字体文件之后进行替换
def get_one_film(url, ua, film_id, saver):
    headers = {
        "User-Agent": ua,
        "Host": "maoyan.com"
    }
    r = requests.get(url=url, headers=headers)
    if r.status_code == 200:
        source_code = r.text
        font_pattern = re.compile(r"url\('(.*?\.woff)'\)")
        font_url = "http:" + font_pattern.search(r.text).group(1).strip()
        del headers["Host"]
        res = requests.get(url=font_url, headers=headers)
        # download the font file and build the glyph-to-digit mapping
        if res.status_code == 200:
            if os.path.exists(film_id):
                os.system(f"rmdir /s /q {film_id}")
            os.makedirs(film_id)
            woff_path = os.path.join(film_id, "temp.woff")
            img_dir = os.path.join(film_id, "images")
            os.makedirs(img_dir)
            with open(woff_path, "wb") as f:
                f.write(res.content)
            woff_to_image(woff_path, img_dir)
            # TODO: try doing the glyph recognition with coroutines later;
            # for now recognise the glyphs directly and store the result
            # as a dict: {"decimal code point": "digit"}
            img_to_num = verify_img(img_dir)
            # remove the temporary files (Windows-specific rmdir)
            os.system(f"rmdir /s /q {film_id}")
            # post-process the scraped data using the replacement mapping
            parse_data_by_lxml(source_code, img_to_num, saver)

def get_urls(url, ua, showType, offset):

    base_url = "https://maoyan.com"
    headers = {
        "User-Agent": ua,
        "Host": "maoyan.com"
    }

    params = {
        "showType": showType,
        "offset": offset
    }

    urls = []
    r = requests.get(url=url, headers=headers, params=params)
    if r.status_code == 200:
        doc = pq(r.text)
        for re_url in doc("#app div div[class='movies-list'] dl dd div[class='movie-item'] a[target='_blank']").items():
            urls.append(base_url + re_url.attr("href"))
    film_urls.extend(urls)
    print(f"当前捕获url{len(film_urls)}个")


if __name__ == "__main__":
    # quick test run
    ua = fake_useragent.UserAgent()
    tasks_one = []
    try:
        for i in range(68):
            tasks_one.append(SpiderThread(get_urls, args=("https://maoyan.com/films", ua.random, "3", str(30*i))))
        for task in tasks_one:
            task.start()
        for task in tasks_one:
            task.join()
    except Exception as e:
        print(e.args)
    saver = SaveInfo()
    film_ids = [url.split("/")[-1] for url in film_urls]
    print(f"捕获电影url共{len(film_urls)}条")
    tasks_two = []
    count = 0
    try:
        for film_url, film_id in zip(film_urls, film_ids):
            tasks_two.append(SpiderThread(get_one_film, args=(film_url, ua.random, film_id, saver)))
        for task in tasks_two:
            task.start()
            count += 1
            if count % 4 == 0:
                time.sleep(5)
        for task in tasks_two:
            task.join()
    except Exception as e:
        print(e.args)
    print("抓取完毕")

resizeImage.py
from PIL import Image
import os

def resize_img(img_path, write_path):
    crop_size = (120, 200)
    img = Image.open(img_path)
    # Image.ANTIALIAS was removed in newer Pillow releases; LANCZOS is the same filter
    new_img = img.resize(crop_size, Image.LANCZOS)
    new_img.save(write_path, quality=100)

if __name__ == "__main__":
    for root, dirs, files in os.walk("verify_images"):
        for file in files:
            img_path = os.path.join(root, file)
            write_path = os.path.join("resized_images", file)
            resize_img(img_path, write_path)
SaveMovieData.py
import pymongo


class SaveInfo:

    def __init__(self, host="localhost", port=27017, db="MovieSpider",
                 collection="maoyan"):
        self._client = pymongo.MongoClient(host=host, port=port)
        self._db = self._client[db]
        self._collection = self._db[collection]

    def insert_dict(self, data: dict):
        result = self._collection.insert_one(data)
        return result.inserted_id
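
A quick usage sketch of SaveInfo (assumes a MongoDB instance on localhost:27017; the sample document is made up):

from SaveMovieData import SaveInfo

saver = SaveInfo()  # connects to the "MovieSpider" database, "maoyan" collection
doc_id = saver.insert_dict({"名称": "示例电影", "评分": 9.0})
print(doc_id)       # ObjectId of the inserted document
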
woffToPng.py
from __future__ import print_function, division, absolute_import
from fontTools.ttLib import TTFont
from fontTools.pens.basePen import BasePen
from reportlab.graphics.shapes import Path
from reportlab.lib import colors
from reportlab.graphics import renderPM
from reportlab.graphics.shapes import Group, Drawing


class ReportLabPen(BasePen):
    """A pen for drawing onto a reportlab.graphics.shapes.Path object."""

    def __init__(self, glyphSet, path=None):
        BasePen.__init__(self, glyphSet)
        if path is None:
            path = Path()
        self.path = path

    def _moveTo(self, p):
        (x, y) = p
        self.path.moveTo(x, y)

    def _lineTo(self, p):
        (x, y) = p
        self.path.lineTo(x, y)

    def _curveToOne(self, p1, p2, p3):
        (x1, y1) = p1
        (x2, y2) = p2
        (x3, y3) = p3
        self.path.curveTo(x1, y1, x2, y2, x3, y3)

    def _closePath(self):
        self.path.closePath()


def woff_to_image(fontName, imagePath, fmt="png"):
    font = TTFont(fontName)
    gs = font.getGlyphSet()
    glyphNames = font.getGlyphNames()
    for i in glyphNames:
        if i == 'x' or i == "glyph00000":  # skip the '.notdef'/'.null' placeholder glyphs
            continue

        g = gs[i]
        pen = ReportLabPen(gs, Path(fillColor=colors.black, strokeWidth=5))
        g.draw(pen)
        w, h = 600, 1000
        g = Group(pen.path)
        g.translate(0, 200)

        d = Drawing(w, h)
        d.add(g)
        imageFile = imagePath + "/" + i + "." + fmt
        renderPM.drawToFile(d, imageFile, fmt)
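
As noted in the pitfalls section, a lot of write-ups identify digits by comparing glyf coordinates in an XML dump of the font. You can produce such a dump with fontTools and check for yourself that the same digit does not keep identical coordinates across fonts, which is why this project renders the glyphs to images instead. A short sketch (the file names are placeholders):

from fontTools.ttLib import TTFont

font = TTFont("temp.woff")   # any font file downloaded by get_one_film()
print(font.getGlyphNames())  # e.g. ['glyph00000', 'uniE602', 'uniE893', ..., 'x']
font.saveXML("temp.xml")     # inspect the <glyf> section: same digit, slightly different x/y values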

ThreadClass.py
import threading


class SpiderThread(threading.Thread):

    def __init__(self, func, args=()):
        super().__init__()
        self.func = func
        self.args = args

    def run(self) -> None:
        self.result = self.func(*self.args)

    # joining inside a getter like this would be equivalent to having no multithreading at all
    # def get_result(self):
    #     threading.Thread.join(self)
    #     try:
    #         return self.result
    #     except Exception as e:
    #         print(e.args)
    #         return None

somexpaths.json
{
  "movie_name": "//body//div[@class='banner']//div//div[@class='movie-brief-container']//h3/text()",
  "movie_ename": "//body//div[@class='banner']//div//div[@class='movie-brief-container']//div/text()",
  "movie_classes": "//body//div[@class='banner']//div//div[@class='movie-brief-container']//ul//li[1]/text()",
  "movie_length": "//body//div[@class='banner']//div//div[@class='movie-brief-container']//ul//li[2]/text()",
  "movie_showtime": "//body//div[@class='banner']//div//div[@class='movie-brief-container']//ul//li[3]/text()",
  "movie_score": "//body//div[@class='banner']//div//div[@class='movie-stats-container']//div//span[@class='index-left info-num ']//span",
  "movie_score_num": "//body//div[@class='banner']//div//div[@class='movie-stats-container']//div//span[@class='score-num']//span",
  "movie_box": "//body//div[@class='wrapper clearfix']//div//div//div//div[@class='movie-index-content box']//span[@class='stonefont']",
  "movie_box_unit": "//body//div[@class='wrapper clearfix']//div//div//div//div[@class='movie-index-content box']//span[@class='unit']/text()",
  "movie_brief_info": "//body//div[@class='container']//div//div//div//div[@class='tab-content-container']//div//div//div[@class='mod-content']//span[@class='dra']/text()",
  "movie_director_and_actress": "//body//div[@id='app']//div//div//div//div[@class='tab-content-container']//div//div[@class='mod-content']//div//div//ul//li//div//a/text()",
  "commenter_names": "//body//div[@id='app']//div//div//div//div//div[@class='module']//div[@class='mod-content']//div[@class='comment-list-container']//ul//li//div//div[@class='user']//span[@class='name']/text()",
  "commenter_comment": "//body//div[@id='app']//div//div//div//div//div[@class='module']//div[@class='mod-content']//div[@class='comment-list-container']//ul//li//div[@class='main']//div[@class='comment-content']/text()"
}