import csv
import re
import time
from typing import Optional, Dict, List, Set
import requests
from bs4 import BeautifulSoup
from langdetect import detect
# ==== Selenium imports ====
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.common.exceptions import WebDriverException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
# ==== Config (SKIP_KEYWORDS filtering removed) ====
INPUT_FILE = "top100000_domains.csv"
OUTPUT_FILE = "top100000_domains_with_text.csv"
# If chromedriver is not on your PATH, set an absolute path here, e.g.:
# CHROMEDRIVER_PATH = r"E:\drivers\chromedriver.exe"
CHROMEDRIVER_PATH: Optional[str] = None  # None means use the chromedriver on the system PATH
REQUEST_TIMEOUT = 12
SELENIUM_PAGELOAD_TIMEOUT = 20
SELENIUM_RENDER_WAIT = 3  # extra seconds after document.readyState completes, giving JS time to render
PAUSE_BETWEEN_REQUESTS = 0.4
MAX_CHARS = 15000  # cap saved text length to avoid I/O pressure from huge pages
MIN_VALID_LENGTH = 100  # text shorter than this counts as invalid and triggers the Selenium fallback
COMMON_HEADERS = {
"User-Agent": (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) "
"Chrome/118.0 Safari/537.36"
),
"Accept-Language": "zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7",
"Accept-Encoding": "gzip, deflate, br",
"Connection": "keep-alive",
}
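# Caveat (worth verifying in your environment, this is an assumption about your
# install): requests only decodes Brotli ("br") response bodies when the
# brotli/brotlicffi package is available; if it is not, dropping "br" from the
# header is the safer choice, e.g.:
#   COMMON_HEADERS["Accept-Encoding"] = "gzip, deflate"  # illustrative tweak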
# Multilingual table of boilerplate phrases to discard (extensible)
USELESS_PHRASES_DICT = {
'zh': {"首页", "主页", "返回首页", "联系我们", "关于我们", "隐私政策",
"数据保护", "版权声明",
"下一页", "上一页", "更多", "返回顶部", "订阅", "分享", "帮助中心",
"服务条款", "免责声明", "广告服务", "反馈建议"},
'en': {"home", "homepage", "contact", "contact us", "about", "about us",
"privacy", "privacy policy", "terms of service", "terms and conditions",
"sitemap", "navigation", "register", "login", "sign in", "sign up",
"next", "previous", "more", "back to top", "subscribe", "share", "help",
"disclaimer", "advertising", "feedback"},
'es': {"inicio", "página principal", "contáctenos", "contacto", "sobre nosotros",
"política de privacidad", "mapa del sitio", "navegación", "registrarse",
"iniciar sesión", "siguiente", "anterior", "más", "volver arriba",
"suscribirse", "compartir", "ayuda", "aviso legal", "publicidad"},
'fr': {"accueil", "page d'accueil", "contact", "contactez-nous", "à propos",
"politique de confidentialité", "plan du site", "navigation", "s'inscrire",
"connexion", "suivant", "précédent", "plus", "haut de page", "s'abonner",
"partager", "aide", "mention légale", "publicité"},
'de': {"startseite", "kontakt", "über uns", "datenschutz", "datenschutzerklärung",
"seitenverzeichnis", "navigation", "registrieren", "anmelden", "weiter",
"zurück", "mehr", "nach oben", "abonnieren", "teilen", "hilfe",
"haftungsausschluss", "werbung"}
}
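# To cover another language, add a set keyed by its ISO 639-1 code. A minimal
# sketch (the Italian phrases below are illustrative additions, not part of
# the original table):
#   USELESS_PHRASES_DICT['it'] = {"pagina iniziale", "contatti", "chi siamo",
#                                 "privacy", "registrati", "accedi"}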
# ==== Language detection and text filtering ====
def detect_lang(html_text: str) -> str:
    """Prefer the <html lang=""> attribute; fall back to langdetect."""
    soup = BeautifulSoup(html_text, "html.parser")
    lang_attr = soup.html.get("lang") if soup.html else None
    if lang_attr:
        return lang_attr.split("-")[0].lower()
    try:
        return detect(soup.get_text())
    except Exception:
        return 'en'  # default to English
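# Illustrative behavior of detect_lang (examples, not executed at import time):
#   detect_lang('<html lang="zh-CN"><body>...</body></html>')      # -> 'zh' from the lang attribute
#   detect_lang('<html><body>Plain English prose.</body></html>')  # -> likely 'en' via langdetect
# langdetect can be non-deterministic on short inputs, hence the 'en' fallback.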
def is_useful_text(text: str, lang: str = 'en') -> bool:
    """Return True if the text carries real content."""
    text = text.strip()
    if len(text) <= 2:
        return False
    if re.fullmatch(r"[\d\W_]+", text):  # digits/symbols only
        return False
    useless_set = USELESS_PHRASES_DICT.get(lang, set())
    if text.lower() in useless_set:
        return False
    return True
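# Illustrative behavior of is_useful_text:
#   is_useful_text("privacy policy", "en")  # False: in the English boilerplate set
#   is_useful_text("2024-01-01", "en")      # False: digits/symbols only
#   is_useful_text("ok", "en")              # False: too short (<= 2 chars)
#   is_useful_text("Quarterly earnings beat expectations", "en")  # True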
def extract_useful_text_from_soup(soup: BeautifulSoup, lang: str) -> str:
    """Extract useful text from a parsed BeautifulSoup document."""
    # Buckets for the different kinds of content
data = {
"title": "",
"meta_description": "",
"meta_keywords": "",
"headings": [],
"paragraphs": [],
"alt_texts": [],
"link_texts": [],
"table_contents": [],
"list_items": []
}
    # Title (soup.title.string can be None, so guard before using it)
    if soup.title and soup.title.string and is_useful_text(soup.title.string, lang):
        data["title"] = soup.title.string.strip()
    # Meta description & keywords
desc_tag = soup.find("meta", attrs={"name": "description"})
if desc_tag and desc_tag.get("content") and is_useful_text(desc_tag["content"], lang):
data["meta_description"] = desc_tag["content"].strip()
keywords_tag = soup.find("meta", attrs={"name": "keywords"})
if keywords_tag and keywords_tag.get("content") and is_useful_text(keywords_tag["content"], lang):
data["meta_keywords"] = keywords_tag["content"].strip()
    # Heading tags
for tag in ["h1", "h2", "h3", "h4", "h5", "h6"]:
for h in soup.find_all(tag):
txt = re.sub(r"\s+", " ", h.get_text(strip=True))
if is_useful_text(txt, lang):
data["headings"].append(txt)
    # Paragraphs
for p in soup.find_all("p"):
txt = re.sub(r"\s+", " ", p.get_text(strip=True))
if is_useful_text(txt, lang):
data["paragraphs"].append(txt)
    # Image alt text
for img in soup.find_all("img"):
alt = img.get("alt")
if alt and is_useful_text(alt, lang):
data["alt_texts"].append(alt.strip())
    # Link text
for a in soup.find_all("a"):
txt = re.sub(r"\s+", " ", a.get_text(strip=True))
if is_useful_text(txt, lang):
data["link_texts"].append(txt)
    # Table contents
for table in soup.find_all("table"):
for row in table.find_all("tr"):
cells = [re.sub(r"\s+", " ", c.get_text(strip=True)) for c in row.find_all(["td", "th"])]
cells = [c for c in cells if is_useful_text(c, lang)]
if cells:
data["table_contents"].append(" | ".join(cells))
    # List items
for li in soup.find_all("li"):
txt = re.sub(r"\s+", " ", li.get_text(strip=True))
if is_useful_text(txt, lang):
data["list_items"].append(txt)
    # Merge all useful text
combined_parts = [
data["title"], data["meta_description"], data["meta_keywords"],
*data["headings"], *data["paragraphs"], *data["alt_texts"],
*data["link_texts"], *data["table_contents"], *data["list_items"]
]
combined_text = "\n".join(filter(None, combined_parts))
return combined_text
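# A minimal usage sketch with a hypothetical document (output order follows the
# combined_parts list above: title, meta fields, headings, paragraphs, ...):
#   soup = BeautifulSoup(
#       "<html><head><title>Acme Corp</title></head>"
#       "<body><h1>Products</h1><p>We build widgets.</p></body></html>",
#       "html.parser")
#   extract_useful_text_from_soup(soup, "en")  # -> "Acme Corp\nProducts\nWe build widgets."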
# ==== Text cleaning for BERT ====
def clean_text_for_bert(text: str) -> str:
    # Collapse whitespace and strip special symbols
text = re.sub(r"\s+", " ", text)
text = re.sub(r"[^\w\s.,!?;:()\[\]\'\"-]", "", text)
text = text.strip()
    # Enforce the length cap
if len(text) > MAX_CHARS:
text = text[:MAX_CHARS]
return text
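# Note: \w is Unicode-aware in Python, so CJK text survives the filter; only
# symbols outside the whitelist are stripped. Illustrative:
#   clean_text_for_bert("Breaking\n\nnews:  markets ↑ 5%")  # -> "Breaking news: markets  5"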
# ==== Requests-first fetch (SKIP_KEYWORDS check removed) ====
def fetch_with_requests(url: str) -> str:
try:
if not url.startswith(("http://", "https://")):
url = "https://" + url # 强制 HTTPS,提高成功率
# (已删除)原SKIP_KEYWORDS过滤逻辑:if any(k in url for k in SKIP_KEYWORDS): return "[SKIPPED...]"
session = requests.Session()
resp = session.get(url, headers=COMMON_HEADERS, timeout=REQUEST_TIMEOUT)
        # Key improvement: use the auto-detected encoding to avoid mojibake
resp.encoding = resp.apparent_encoding
resp.raise_for_status()
        # Detect the page language
lang = detect_lang(resp.text)
soup = BeautifulSoup(resp.text, "html.parser")
        # Drop tags that carry no useful content
for tag in soup(["script", "style", "noscript", "iframe", "header", "footer", "nav", "form", "aside"]):
tag.decompose()
        # Extract useful text
text = extract_useful_text_from_soup(soup, lang)
text = clean_text_for_bert(text)
return text if text else "[NO TEXT]"
except Exception as e:
return f"[ERROR_REQUESTS] {e}"
# ==== Selenium driver init ====
def init_selenium_driver() -> Optional[webdriver.Chrome]:
try:
options = Options()
options.add_argument("--headless=new") # Chrome 109+ 推荐
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_argument("--window-size=1920,1080")
        # Optionally reduce resource loading for speed: block images, popups,
        # and notifications (2 = block, 1 = allow)
prefs = {
"profile.managed_default_content_settings.images": 2,
"profile.managed_default_content_settings.stylesheets": 1,
"profile.managed_default_content_settings.fonts": 1,
"profile.managed_default_content_settings.popups": 2,
"profile.managed_default_content_settings.notifications": 2,
}
options.add_experimental_option("prefs", prefs)
options.add_argument(f"user-agent={COMMON_HEADERS['User-Agent']}")
if CHROMEDRIVER_PATH:
service = Service(CHROMEDRIVER_PATH)
else:
            service = Service()  # rely on the system PATH
driver = webdriver.Chrome(service=service, options=options)
driver.set_page_load_timeout(SELENIUM_PAGELOAD_TIMEOUT)
return driver
except WebDriverException as e:
print(f"[WARN] Selenium 初始化失败: {e}")
return None
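# Note: with Selenium 4.6+, Service() without a path lets Selenium Manager
# resolve a matching chromedriver automatically, so CHROMEDRIVER_PATH can
# usually stay None. Quick smoke test (illustrative):
#   drv = init_selenium_driver()
#   if drv:
#       drv.get("https://example.com"); print(drv.title); drv.quit()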
# ==== Selenium render fetch (SKIP_KEYWORDS check removed) ====
def fetch_with_selenium(url: str, driver: webdriver.Chrome) -> str:
try:
if not url.startswith(("http://", "https://")):
url = "https://" + url
        # (removed) former SKIP_KEYWORDS filter: if any(k in url for k in SKIP_KEYWORDS): return "[SKIPPED...]"
driver.get(url)
        # Wait for the document to be ready
WebDriverWait(driver, SELENIUM_PAGELOAD_TIMEOUT).until(
lambda d: d.execute_script("return document.readyState") == "complete"
)
        # Give JS extra render time
time.sleep(SELENIUM_RENDER_WAIT)
        # Grab the rendered page source
html = driver.page_source
lang = detect_lang(html)
soup = BeautifulSoup(html, "html.parser")
        # Drop tags that carry no useful content
for tag in soup(["script", "style", "noscript", "iframe", "header", "footer", "nav", "form", "aside"]):
tag.decompose()
        # Extract useful text
text = extract_useful_text_from_soup(soup, lang)
text = clean_text_for_bert(text)
return text if text else "[NO TEXT]"
except TimeoutException as e:
return f"[ERROR_SELENIUM_TIMEOUT] {e}"
except Exception as e:
return f"[ERROR_SELENIUM] {e}"
# ==== Unified fetch with fallback ====
def fetch_text(url: str, driver: Optional[webdriver.Chrome]) -> str:
    # Try requests first
text = fetch_with_requests(url)
if text.startswith("[ERROR_REQUESTS]") or text == "[NO TEXT]" or len(text) < MIN_VALID_LENGTH:
        # Fall back to Selenium rendering
        if driver is None:
            # No driver available: return the requests result (or error) as-is
return text
text2 = fetch_with_selenium(url, driver)
        # If Selenium also failed, return a more informative result
if text2.startswith("[ERROR_SELENIUM"):
            # Concatenate both errors for easier auditing
return f"{text} | {text2}"
        # Prefer the Selenium result when it succeeded or produced at least as much text
if text2 != "[NO TEXT]" and len(text2) >= len(text):
return text2
return text
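# Standalone usage sketch of the fallback chain (the domain is illustrative):
#   driver = init_selenium_driver()
#   print(fetch_text("example.com", driver))  # requests first, Selenium on weak results
#   if driver:
#       driver.quit()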
# ==== Process CSV with real-time saving ====
def process_csv():
    driver = init_selenium_driver()  # may return None
try:
with open(INPUT_FILE, newline="", encoding="utf-8") as infile, \
open(OUTPUT_FILE, "a", newline="", encoding="utf-8") as outfile:
reader = csv.reader(infile)
writer = csv.writer(outfile)
            # Write a header row if the output file is new
if outfile.tell() == 0:
writer.writerow(["id", "url", "text"])
            # Skip the input file's header row
header = next(reader, None)
for row in reader:
if len(row) < 2:
continue
id_, url = row[0], row[1]
print(f"正在爬取: {url}")
result_text = fetch_text(url, driver)
writer.writerow([id_, url, result_text])
                outfile.flush()  # persist each row to disk immediately
time.sleep(PAUSE_BETWEEN_REQUESTS)
finally:
if driver is not None:
try:
driver.quit()
except Exception:
pass
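# Note: OUTPUT_FILE is opened in append mode, so a re-run resumes writing but
# re-crawls (and duplicates) rows already saved. A dedup sketch under that
# assumption (illustrative, not called anywhere):
#   with open(OUTPUT_FILE, newline="", encoding="utf-8") as f:
#       done_ids = {row[0] for row in csv.reader(f)}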
if __name__ == "__main__":
process_csv()