import csv
import re
import time
import random
import threading
from queue import Queue, Empty
import requests
from bs4 import BeautifulSoup
from langdetect import detect
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import WebDriverException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
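# Crawl flow overview (derived from the code below):
#   1. main() makes sure OUTPUT_FILE exists with the expected header row, then starts THREAD_NUM worker threads.
#   2. main() streams rows from INPUT_FILE, skips the header and rows before START_ROW, and pushes (id, url)
#      tasks onto the bounded TASK_QUEUE until MAX_CRAWL_ROWS tasks have been queued (or the file ends).
#   3. Each worker owns one headless Chrome driver. For every task it calls fetch_structured_text(), which tries
#      plain requests first and falls back to Selenium when requests fails or returns too little content.
#   4. Extracted fields (title, meta description, meta keywords, content truncated to 500 characters) are
#      appended to OUTPUT_FILE under WRITE_LOCK.
#   5. main() waits on TASK_QUEUE.join(), then joins the worker threads and exits.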
# ============================== Configuration (core changes: longer Selenium timeouts + new retry count) ==============================
INPUT_FILE = "top10milliondomains.csv"
OUTPUT_FILE = "层次文本特征.csv"  # output CSV ("hierarchical text features"); filename kept as-is
CHROMEDRIVER_PATH = None
OUTPUT_COLUMNS = ["id", "url", "title", "meta_description", "meta_keywords", "content"]
# New: crawl range control (1-based row numbers; row 1 is the header, data starts at row 2)
START_ROW = 273700        # first row to crawl (inclusive); e.g. 2 = first data row; after an interruption, set to last finished row + 1
MAX_CRAWL_ROWS = 1000000  # maximum number of rows to crawl (0 or a negative value = crawl to the last row)
# Original crawl settings (core change: longer Selenium-related timeouts)
REQUEST_TIMEOUT = 12
SELENIUM_PAGELOAD_TIMEOUT = 30  # 1. page-load timeout in seconds (raised to tolerate slow connections)
SELENIUM_RENDER_WAIT = 20       # 2. render wait: 10 s → 20 s (give dynamic content time to load)
SELENIUM_RETRY_TIMES = 2        # 3. new: number of retries after a timeout (absorbs transient network hiccups)
PAUSE_BETWEEN_REQUESTS = 0.4
MAX_CHARS_PER_COL = 2000        # length cap for the other fields (title, meta); unchanged
MIN_VALID_LENGTH = 5
# Multithreading settings
THREAD_NUM = 10
TASK_QUEUE = Queue(maxsize=THREAD_NUM * 20)  # bounded queue: the loader blocks on put() when workers fall behind, capping memory use
WRITE_LOCK = threading.Lock()
# Shared request headers (prefer UTF-8 encodings)
COMMON_HEADERS = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0 Safari/537.36",
"Accept-Language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7",
"Accept-Encoding": "gzip, deflate, br",
"Accept-Charset": "UTF-8,gbk,gb2312",
"Connection": "keep-alive",
}
# Boilerplate phrase lists per language (unchanged)
USELESS_PHRASES_DICT = {
'zh': {"首页", "主页", "返回首页", "联系我们", "关于我们", "隐私政策", "数据保护", "版权声明", "下一页", "上一页", "更多", "返回顶部", "订阅", "分享", "帮助中心", "服务条款", "免责声明", "广告服务", "反馈建议"},
'en': {"home", "homepage", "contact", "contact us", "about", "about us", "privacy", "privacy policy", "terms of service", "terms and conditions", "sitemap", "navigation", "register", "login", "sign in", "sign up", "next", "previous", "more", "back to top", "subscribe", "share", "help", "disclaimer", "advertising", "feedback"},
'es': {"inicio", "página principal", "contáctenos", "contacto", "sobre nosotros", "política de privacidad", "mapa del sitio", "navegación", "registrarse", "iniciar sesión", "siguiente", "anterior", "más", "volver arriba", "suscribirse", "compartir", "ayuda", "aviso legal", "publicidad"},
'fr': {"accueil", "page d'accueil", "contact", "contactez-nous", "à propos", "politique de confidentialité", "plan du site", "navigation", "s'inscrire", "connexion", "suivant", "précédent", "plus", "haut de page", "s'abonner", "partager", "aide", "mention légale", "publicité"},
'de': {"startseite", "kontakt", "über uns", "datenschutz", "datenschutzerklärung", "seitenverzeichnis", "navigation", "registrieren", "anmelden", "weiter", "zurück", "mehr", "nach oben", "abonnieren", "teilen", "hilfe", "haftungsausschluss", "werbung"}
}
# ============================== Utility functions (unchanged) ==============================
def detect_lang(html_text):
"""语言检测(优化中文准确性)"""
soup = BeautifulSoup(html_text, "html.parser")
lang_attr = soup.html.get("lang") if soup.html else None
if lang_attr and "zh" in lang_attr:
return "zh"
try:
lang = detect(soup.get_text())
return "zh" if lang in ["zh-cn", "zh-tw", "zh"] else lang
    except Exception:
        # langdetect raises LangDetectException on empty or ambiguous text; default to English
        return 'en'
def is_useful_text(text, lang='en'):
"""文本有效性判断"""
text = text.strip()
if len(text) <= 2 or re.fullmatch(r"[\d\W_]+", text):
return False
return text.lower() not in USELESS_PHRASES_DICT.get(lang, set())
def extract_structured_text(soup, lang):
"""提取结构化文本(核心修改:删除[LINK]/[IMG]/[Hx]等标记,保留纯文本)"""
result = {
"title": "[NO DATA]",
"meta_description": "[NO DATA]",
"meta_keywords": "[NO DATA]",
"content": "[NO DATA]"
}
    # 1. Title (clean up encoding leftovers such as non-breaking spaces)
if soup.title and soup.title.string:
title = soup.title.string.strip().replace("\u00a0", " ")
if is_useful_text(title, lang):
result["title"] = title[:MAX_CHARS_PER_COL]
    # 2. Meta description
desc_tag = soup.find("meta", attrs={"name": "description"})
if desc_tag and desc_tag.get("content"):
meta_desc = desc_tag["content"].strip().replace("\u00a0", " ")
if is_useful_text(meta_desc, lang):
result["meta_description"] = meta_desc[:MAX_CHARS_PER_COL]
    # 3. Meta keywords
keywords_tag = soup.find("meta", attrs={"name": "keywords"})
if keywords_tag and keywords_tag.get("content"):
meta_key = keywords_tag["content"].strip().replace("\u00a0", " ")
if is_useful_text(meta_key, lang):
result["meta_keywords"] = meta_key[:MAX_CHARS_PER_COL]
    # 4. Body text (all marker prefixes removed; keep plain text only)
content_parts = []
    # 4.1 Heading tags (no prefix)
for tag in ["h1", "h2", "h3", "h4", "h5", "h6"]:
for h in soup.find_all(tag):
txt = re.sub(r"\s+", " ", h.get_text(strip=True).replace("\u00a0", " "))
if is_useful_text(txt, lang):
content_parts.append(txt)
    # 4.2 Paragraph text (no prefix)
for p in soup.find_all("p"):
txt = re.sub(r"\s+", " ", p.get_text(strip=True).replace("\u00a0", " "))
if is_useful_text(txt, lang) and len(txt) > 5:
content_parts.append(txt)
    # 4.3 Supplementary content: image ALT text + link text (no prefix)
extra_parts = []
for img in soup.find_all("img"):
alt = img.get("alt")
if alt:
alt_clean = alt.strip().replace("\u00a0", " ")
if is_useful_text(alt_clean, lang):
extra_parts.append(alt_clean)
for a in soup.find_all("a"):
txt = re.sub(r"\s+", " ", a.get_text(strip=True).replace("\u00a0", " "))
if is_useful_text(txt, lang) and len(txt) > 3:
extra_parts.append(txt)
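    # extra_parts come after the headings/paragraphs when joined below, so the length caps favor the main content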
    # Merge the body text (capped at MAX_CHARS_PER_COL here; further truncated to 500 characters when written out)
if content_parts or extra_parts:
full_content = "\n".join(content_parts + extra_parts)[:MAX_CHARS_PER_COL]
if len(full_content.replace("[NO DATA]", "").strip()) >= MIN_VALID_LENGTH:
result["content"] = full_content
return result
def clean_text(text):
"""文本清洗(保留中文/英文/数字/常见标点,不变)"""
text = re.sub(r"[^\u4e00-\u9fa5a-zA-Z0-9\s.,!?;:()\[\]\'\"-]", "", text)
text = re.sub(r"\s+", " ", text).strip()
return text[:MAX_CHARS_PER_COL] if len(text) > MAX_CHARS_PER_COL else text
# ============================== Fetch functions (core change: Selenium timeout retries) ==============================
def fetch_with_requests(url):
"""Requests爬取(优化编码)"""
try:
if not url.startswith(("http://", "https://")):
url = "https://" + url
resp = requests.get(url, headers=COMMON_HEADERS, timeout=REQUEST_TIMEOUT)
        # Improve encoding detection (avoids garbled Chinese text)
if resp.encoding and resp.encoding.lower() in ["utf-8", "gbk", "gb2312"]:
encoding = resp.encoding
else:
encoding = resp.apparent_encoding
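            # apparent_encoding often guesses ISO-8859-1 for pages without an explicit charset; the round trip below
            # recovers the raw bytes via latin-1 and tests whether they are valid UTF-8, otherwise falls back to GBK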
if encoding.lower() in ["iso-8859-1", "latin-1"]:
encoding = "UTF-8"
try:
resp.text.encode("latin-1").decode("utf-8")
except UnicodeDecodeError:
encoding = "GBK"
resp.encoding = encoding
html_text = resp.text
        soup = BeautifulSoup(html_text, "html.parser")  # html_text is already a decoded str, so from_encoding is not needed (BeautifulSoup would ignore it and warn)
        # Remove non-content tags
for tag in ["script", "style", "noscript", "iframe", "header", "footer", "nav", "form", "aside"]:
for t in soup.find_all(tag):
t.decompose()
lang = detect_lang(html_text)
structured_text = extract_structured_text(soup, lang)
return {
"title": clean_text(structured_text["title"]),
"meta_description": clean_text(structured_text["meta_description"]),
"meta_keywords": clean_text(structured_text["meta_keywords"]),
"content": clean_text(structured_text["content"])
}
    except UnicodeDecodeError as e:
        err_msg = f"Encoding/decoding failed: {str(e)[:80]}"
return {
"title": f"[ERROR_ENCODING] {err_msg}",
"meta_description": f"[ERROR_ENCODING] {err_msg}",
"meta_keywords": f"[ERROR_ENCODING] {err_msg}",
"content": f"[ERROR_ENCODING] {err_msg}"
}
except Exception as e:
err_msg = str(e)[:100].replace(",", ",").replace("\n", " ")
return {
"title": f"[ERROR_REQUESTS] {err_msg}",
"meta_description": f"[ERROR_REQUESTS] {err_msg}",
"meta_keywords": f"[ERROR_REQUESTS] {err_msg}",
"content": f"[ERROR_REQUESTS] {err_msg}"
}
def init_selenium_driver():
"""初始化Selenium驱动(中文配置,不变)"""
try:
options = Options()
options.add_argument("--headless=new")
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--window-size=1920,1080")
options.add_argument("--lang=zh-CN") # 中文渲染
options.add_experimental_option("prefs", {
"profile.managed_default_content_settings.images": 2, # 禁用图片加载(加快速度)
"profile.managed_default_content_settings.stylesheets": 1,
"profile.managed_default_content_settings.fonts": 1,
"profile.managed_default_content_settings.popups": 2,
"profile.managed_default_content_settings.notifications": 2
})
options.add_argument(f"user-agent={COMMON_HEADERS['User-Agent']}")
service = Service(CHROMEDRIVER_PATH) if CHROMEDRIVER_PATH else Service()
driver = webdriver.Chrome(service=service, options=options)
        driver.set_page_load_timeout(SELENIUM_PAGELOAD_TIMEOUT)  # apply the longer page-load timeout
return driver
except WebDriverException as e:
print(f"[WARN] Selenium初始化失败: {e}")
return None
def fetch_with_selenium(url, driver):
    """Fetch with Selenium (core change: retry on timeout)."""
    retry_count = 0  # retries used so far
    while retry_count <= SELENIUM_RETRY_TIMES:
        try:
            if not url.startswith(("http://", "https://")):
                url = "https://" + url
            driver.get(url)
            # Keep the explicit wait in line with the page-load timeout so a timeout is not declared too early
            WebDriverWait(driver, SELENIUM_PAGELOAD_TIMEOUT).until(
                lambda d: d.execute_script("return document.readyState") == "complete"
            )
            time.sleep(SELENIUM_RENDER_WAIT)  # extra render wait for dynamically loaded content
            # Normalize the page source to UTF-8
            html_text = driver.page_source.encode("utf-8", errors="replace").decode("utf-8")
            soup = BeautifulSoup(html_text, "html.parser")  # page_source is already a str, no from_encoding needed
            # Remove non-content tags
            for tag in ["script", "style", "noscript", "iframe", "header", "footer", "nav", "form", "aside"]:
                for t in soup.find_all(tag):
                    t.decompose()
            lang = detect_lang(html_text)
            structured_text = extract_structured_text(soup, lang)
            return {
                "title": clean_text(structured_text["title"]),
                "meta_description": clean_text(structured_text["meta_description"]),
                "meta_keywords": clean_text(structured_text["meta_keywords"]),
                "content": clean_text(structured_text["content"])
            }
        except TimeoutException as e:
            retry_count += 1
            if retry_count > SELENIUM_RETRY_TIMES:
                # Retries exhausted: return a timeout error record
                err_msg = f"Timeout (retried {SELENIUM_RETRY_TIMES} times): {str(e)[:80]}".replace(",", ",").replace("\n", " ")
                return {
                    "title": f"[ERROR_SEL_TIMEOUT] {err_msg}",
                    "meta_description": f"[ERROR_SEL_TIMEOUT] {err_msg}",
                    "meta_keywords": f"[ERROR_SEL_TIMEOUT] {err_msg}",
                    "content": f"[ERROR_SEL_TIMEOUT] {err_msg}"
                }
            # Retries remain: back off and try again
            wait_time = 3 + retry_count * 2  # increasing backoff: 5 s after the first timeout, 7 s after the second, ...
            print(f"[WARN] Selenium timeout, retrying in {wait_time}s (attempt {retry_count}/{SELENIUM_RETRY_TIMES}): {url}")
            time.sleep(wait_time)
        except Exception as e:
            err_msg = str(e)[:80].replace(",", ",").replace("\n", " ")
            return {
                "title": f"[ERROR_SEL] {err_msg}",
                "meta_description": f"[ERROR_SEL] {err_msg}",
                "meta_keywords": f"[ERROR_SEL] {err_msg}",
                "content": f"[ERROR_SEL] {err_msg}"
            }
def fetch_structured_text(url, driver):
"""统一爬取入口(不变)"""
req_result = fetch_with_requests(url)
    req_valid = not req_result["title"].startswith("[ERROR") and \
req_result["title"] != "[NO DATA]" and \
len(req_result["content"].replace("[NO DATA]", "").strip()) >= MIN_VALID_LENGTH
if not req_valid and driver:
sel_result = fetch_with_selenium(url, driver)
return sel_result
return req_result
# ============================== Worker thread function (unchanged: content truncated to 500 characters) ==============================
def thread_worker():
    driver = init_selenium_driver()
    try:
        while True:
            try:
                id_, url = TASK_QUEUE.get(block=True, timeout=5)
            except Empty:
                print(f"[INFO] Thread {threading.current_thread().name}: queue empty, exiting")
                break
            try:
                print(f"[INFO] Thread {threading.current_thread().name}: crawling {url}")
                structured_text = fetch_structured_text(url, driver)
                # Core change 1: keep only the first 500 characters of content (fits BERT's 512-token limit)
                truncated_content = structured_text["content"][:500]
                with WRITE_LOCK:
                    with open(OUTPUT_FILE, "a", newline="", encoding="utf-8-sig") as f:
                        writer = csv.writer(f)
                        writer.writerow([
                            id_,
                            url,
                            structured_text["title"],
                            structured_text["meta_description"],
                            structured_text["meta_keywords"],
                            truncated_content  # write the truncated content
                        ])
            except Exception as e:
                print(f"[ERROR] Thread {threading.current_thread().name}: failed on {url}: {str(e)[:200]}")
            finally:
                TASK_QUEUE.task_done()  # always mark the task done so TASK_QUEUE.join() cannot hang on an error
            time.sleep(PAUSE_BETWEEN_REQUESTS + random.uniform(0, 0.2))
    finally:
        if driver:
            try:
                driver.quit()
                print(f"[INFO] Thread {threading.current_thread().name}: driver closed")
            except Exception as e:
                print(f"[WARN] Thread {threading.current_thread().name}: error closing driver: {e}")
# ============================== Main (unchanged: load tasks by start row + total row count) ==============================
def main():
    # Initialize the output file (unchanged). NOTE: if the existing header does not match OUTPUT_COLUMNS, the file is rewritten and any previous rows are lost.
try:
with open(OUTPUT_FILE, "r", encoding="utf-8-sig") as f:
first_line = f.readline().strip()
if not first_line or first_line != ",".join(OUTPUT_COLUMNS):
with open(OUTPUT_FILE, "w", newline="", encoding="utf-8-sig") as f2:
csv.writer(f2).writerow(OUTPUT_COLUMNS)
except FileNotFoundError:
with open(OUTPUT_FILE, "w", newline="", encoding="utf-8-sig") as f:
csv.writer(f).writerow(OUTPUT_COLUMNS)
    # Start worker threads (unchanged)
    print(f"[INFO] Starting {THREAD_NUM} crawler threads...")
threads = []
for i in range(THREAD_NUM):
t = threading.Thread(target=thread_worker, name=f"Worker-{i+1}")
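        # daemon threads cannot block interpreter shutdown if main exits early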
t.daemon = True
t.start()
threads.append(t)
print(f"[INFO] 线程{t.name}启动")
    # Core change 2: load tasks according to the start row (START_ROW) and the row budget (MAX_CRAWL_ROWS)
    print(f"[INFO] Reading {INPUT_FILE} to load tasks...")
    print(f"[INFO] Crawl settings: start row = {START_ROW}, max rows = {MAX_CRAWL_ROWS if MAX_CRAWL_ROWS > 0 else 'unlimited'}")
    print(f"[INFO] Selenium settings: page-load timeout = {SELENIUM_PAGELOAD_TIMEOUT}s, render wait = {SELENIUM_RENDER_WAIT}s, timeout retries = {SELENIUM_RETRY_TIMES}")
    task_count = 0   # number of valid tasks loaded so far
    current_row = 0  # current row number (1-based; row 1 is the header)
with open(INPUT_FILE, newline="", encoding="utf-8") as f:
reader = csv.reader(f)
for row in reader:
            current_row += 1  # advance the row counter for every line read
            # 1. Skip the header (row 1)
            if current_row == 1:
                continue
            # 2. Skip rows before the start row
            if current_row < START_ROW:
                continue
            # 3. Stop loading once the maximum number of rows has been reached
            if MAX_CRAWL_ROWS > 0 and task_count >= MAX_CRAWL_ROWS:
                print(f"[INFO] Reached the maximum of {MAX_CRAWL_ROWS} rows, stopping task loading")
                break
            # 4. Skip incomplete rows (need at least id and url)
            if len(row) < 2:
                print(f"[WARN] Row {current_row} is incomplete (needs at least 2 columns), skipping: {row}")
                continue
            # 5. Extract id and url and enqueue the task (blocks while the queue is full)
            id_, url = row[0], row[1]
            TASK_QUEUE.put((id_, url), block=True)
            task_count += 1
            # 6. Log progress every 100 tasks
            if task_count % 100 == 0:
                print(f"[INFO] Loaded {task_count} tasks, queue size: {TASK_QUEUE.qsize()}, current row: {current_row}")
    # Report the loading result
    if task_count == 0:
        print(f"[WARN] No tasks were loaded! Check that: 1. the start row {START_ROW} does not exceed the number of data rows; 2. the input file contains valid data")
    else:
        print(f"[INFO] Task loading finished: {task_count} tasks loaded (last row read: {current_row})")
    # Wait for all queued tasks to be processed (unchanged)
    TASK_QUEUE.join()
    print(f"[INFO] All tasks finished!")
    # Wait for the worker threads to exit (unchanged)
    for t in threads:
        t.join(timeout=10)
    print(f"[INFO] All threads exited, program finished")
if __name__ == "__main__":
main()
Walk through the crawling flow of this code.