img.readyState != complete: a permission issue, and a rather futile one

This post tries to validate an image's file size and dimensions on the client side before uploading, but a permission restriction prevents the check from completing. A JavaScript snippet is included to show how the image is loaded and its basic properties inspected.

Today I was working on a problem: I wanted to validate the file size and dimensions of an image on the client side before uploading it, and it just wouldn't work.

It turned out to be this: img.readyState never reaches "complete", which is really a permission issue.

A page served from a server has no permission to access the client's local resources, so the <img> pointing at the local file path never loads. Still, a lesson learned, haha.

The code is below for reference. It works when the page is opened straight from disk, but once it is deployed under Tomcat it stops working. o(╯□╰)o
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN">
<HTML>
<HEAD>
<TITLE> New Document </TITLE>
<script type="text/javascript">
// Note: this snippet relies on IE-only APIs (attachEvent, onpropertychange,
// img.fileSize, removeNode). It only works when the page is opened from the
// local file system; once served from Tomcat, IE blocks access to the local
// path and the image never reaches readyState == "complete".
var img = null;

// Check whether the selected file is an image
function chkimg(inp){
 if (img) img.removeNode(true);                 // drop the previously inserted <img> (IE-only)
 img = document.createElement("img");
 img.style.position = "absolute";
 img.style.visibility = "hidden";
 img.attachEvent("onreadystatechange", isimg);  // fires while the image loads
 img.attachEvent("onerror", notimg);            // fires when the file is not an image
 img.src = inp;                                 // local path taken from the file input
}

function notimg(){
 alert("The selected file is not an image, please choose again.");
}

function isimg(){
 if (img.readyState != "complete") return;      // wait until the image is fully loaded
 // Insert the hidden image so offsetWidth/offsetHeight are available,
 // then report the file size (IE-only img.fileSize, in bytes) and dimensions.
 show.insertAdjacentElement("BeforeEnd", img);
 show1.innerHTML = "File size: " + img.fileSize/1024 + " KB<br />Width: "
                 + img.offsetWidth + "<br />Height: " + img.offsetHeight;
}
</script>
</HEAD>
<BODY>
<div id="show"></div>
<div id="show1"></div>
<input type="file" name="" onpropertychange="chkimg(this.value)"/>
</BODY>
</HTML>
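
Not part of the original post, just a minimal sketch for comparison: in current browsers the same pre-upload check can be done with the File API, reading the selected File object instead of its local path, so it also works when the page is served from Tomcat. The element ids fileInput and info below are assumptions for the example.

<input type="file" id="fileInput" accept="image/*" />
<div id="info"></div>
<script>
// Read the selected File object directly instead of its local path,
// so the browser never needs access to the client's file system path.
document.getElementById("fileInput").addEventListener("change", function () {
  var file = this.files && this.files[0];
  if (!file) return;
  var url = URL.createObjectURL(file);
  var img = new Image();
  img.onload = function () {
    document.getElementById("info").innerHTML =
      "File size: " + (file.size / 1024).toFixed(1) + " KB<br />" +
      "Width: " + img.naturalWidth + "<br />Height: " + img.naturalHeight;
    URL.revokeObjectURL(url);
  };
  img.onerror = function () {
    alert("The selected file is not an image, please choose again.");
    URL.revokeObjectURL(url);
  };
  img.src = url;
});
</script>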

 
