<think>Okay, the user now wants complete crawler code that can be copied and pasted directly. Code has already been provided twice, but the user may have run into problems with it, or wants the latest optimized version. I need to review the earlier conversation and make sure this version integrates all the optimizations and runs as-is. First, the user asked "how do I optimize the crawler so it fetches the content I want", and the earlier answers already added modules for smart filtering, concurrency, and dynamic-content support. Now that the user is asking for the complete code again, they probably want those fragments merged into a single file with nothing missing. I need to check the dependencies, for example whether new libraries must be imported, such as concurrent.futures for the thread pool or selenium for dynamic content. The user may not have them installed, so the code should include comments or instructions for installation.
The user also likely cares about configurability, such as how to set keywords and URL patterns, so the configuration section must be clear and easy to modify. Possible failures, such as network errors and parsing errors, need to be handled so the code is robust. The structure should be easy to understand and extend, for example by splitting functionality into classes: CrawlerSession for network requests, ContentParser for parsing, FileManager for file operations. Finally, the code must be complete, with all necessary classes and configuration, an example entry point under __main__, and run instructions: how to install dependencies and how to adjust parameters. The user's environment may also matter, such as the Python version, async support, or proxy configuration; detailed comments and configuration options will help them adapt the code.
In short: integrate the previously optimized code, make sure all modules are wired together correctly, provide clear configuration and run instructions, and handle errors and exceptions so the code runs out of the box.</think>

Below is the complete, runnable optimized crawler, including content filtering, concurrent processing, and dynamic-content support:
```python
import os
import re
import logging
import json
import random
import time
from urllib.parse import urljoin, urlparse
from queue import PriorityQueue
from concurrent.futures import ThreadPoolExecutor
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm
from selenium import webdriver  # dynamic-content support

# Configuration
CONFIG = {
    'REQUEST': {
        'headers': {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept-Encoding': 'gzip, deflate'
        },
        'timeout': 20,
        'max_retries': 5,
        'delay': 1.5,
        'proxy': None  # optional: a requests-style proxies dict
    },
    'SAVING': {
        'output_md': 'output/results.md',
        'max_title_length': 80
    },
    'OPTIMIZATION': {
        'content_keywords': ['芯片', '半导体', '集成电路'],  # content keyword filter (chip, semiconductor, integrated circuit)
        'url_patterns': [r'/article/\d+\.html$'],  # URL-matching regexes
        'dynamic_content': False,  # enable JavaScript-rendered content
        'cache_ttl': 3600,  # cache lifetime in seconds
        'priority_bias': 0.7  # priority randomization factor
    },
    'PERFORMANCE': {
        'max_workers': 10,  # maximum number of threads
        'max_pages': 200  # maximum number of pages to crawl
    }
}

# Create the output directory
os.makedirs(os.path.dirname(CONFIG['SAVING']['output_md']), exist_ok=True)

# Logging configuration
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s [%(levelname)s] %(message)s',
    handlers=[
        logging.FileHandler('smart_crawler.log', encoding='utf-8'),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)
class CrawlerSession:
    """Network session management."""

    def __init__(self):
        self.session = requests.Session()
        self.last_request_time = 0
        self.driver = None  # Selenium driver, created lazily

    def get(self, url):
        """Fetch a URL, choosing static or dynamic mode based on the config."""
        if CONFIG['OPTIMIZATION']['dynamic_content']:
            return self._get_dynamic(url)
        return self._get_static(url)

    def _get_static(self, url):
        """Fetch static content with requests."""
        self._rate_limit()
        try:
            response = self.session.get(
                url,
                headers=CONFIG['REQUEST']['headers'],
                proxies=CONFIG['REQUEST']['proxy'],
                timeout=CONFIG['REQUEST']['timeout']
            )
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            logger.warning(f"Request failed: {url} - {str(e)}")
            return None

    def _get_dynamic(self, url):
        """Fetch JavaScript-rendered content with headless Chrome."""
        if not self.driver:
            options = webdriver.ChromeOptions()
            options.add_argument('--headless')
            options.add_argument('--disable-gpu')
            self.driver = webdriver.Chrome(options=options)
        try:
            self.driver.get(url)
            time.sleep(2 + random.random())  # random wait for rendering
            return self.driver.page_source
        except Exception as e:
            logger.error(f"Dynamic request failed: {url} - {str(e)}")
            return None

    def _rate_limit(self):
        """Throttle request frequency."""
        elapsed = time.time() - self.last_request_time
        if elapsed < CONFIG['REQUEST']['delay']:
            time.sleep(CONFIG['REQUEST']['delay'] - elapsed)
        self.last_request_time = time.time()

    def __del__(self):
        if self.driver:
            self.driver.quit()
class ContentParser:
    """Content parser."""

    @staticmethod
    def safe_parse(html):
        """Parse HTML, falling back to the built-in parser if lxml is unavailable."""
        try:
            return BeautifulSoup(html, 'lxml')
        except Exception:
            return BeautifulSoup(html, 'html.parser')

    @classmethod
    def extract_links(cls, soup, base_url):
        """Extract and filter links."""
        raw_links = [urljoin(base_url, a['href']) for a in soup.find_all('a', href=True)]
        return [link for link in raw_links if cls._is_valid_url(link)]

    @staticmethod
    def _is_valid_url(url):
        """Validate a URL."""
        parsed = urlparse(url)
        return parsed.scheme in ('http', 'https') and parsed.netloc
class FileManager:
    """File management."""

    @staticmethod
    def sanitize_title(text):
        """Build a filesystem- and Markdown-safe title."""
        clean_title = re.sub(r'[\\/*?:"<>|#]', '', text)[:CONFIG['SAVING']['max_title_length']]
        return clean_title.strip() or "Untitled"

    @classmethod
    def save_content(cls, content, url):
        """Save content only if it looks valuable."""
        if not cls._is_valuable(content['text']):
            return False
        try:
            title = cls.sanitize_title(content['text'][:100])
            md_content = f"\n## [{title}]({url})\n**来源**: {url}\n{content['text']}\n---\n"
            with open(CONFIG['SAVING']['output_md'], 'a', encoding='utf-8') as f:
                f.write(md_content)
            return True
        except Exception as e:
            logger.error(f"Save failed: {str(e)}")
            return False

    @staticmethod
    def _is_valuable(text):
        """Decide whether the text is worth keeping."""
        text = text.lower()
        return (
            len(text) > 100 and
            any(kw in text for kw in CONFIG['OPTIMIZATION']['content_keywords']) and
            not any(waste in text for waste in ['广告', '免责声明'])  # skip ads / disclaimers
        )
class SmartCrawler:
    """Crawler core."""

    def __init__(self):
        self.session = CrawlerSession()
        self.parser = ContentParser()
        self.file_manager = FileManager()
        self.visited = set()
        self.url_queue = PriorityQueue()
        self.executor = ThreadPoolExecutor(max_workers=CONFIG['PERFORMANCE']['max_workers'])
        self.cache = {}
        self.progress = None

    def crawl(self, start_url, max_depth=3):
        """Run the crawler starting from start_url."""
        self._add_url(start_url, depth=1)
        futures = []
        with tqdm(total=CONFIG['PERFORMANCE']['max_pages'], desc="Smart crawl") as self.progress:
            while len(self.visited) < CONFIG['PERFORMANCE']['max_pages']:
                if self.url_queue.empty():
                    # The queue may refill while in-flight pages are being parsed;
                    # only stop once all submitted tasks have finished.
                    if all(f.done() for f in futures):
                        break
                    time.sleep(0.1)
                    continue
                _, depth, url = self.url_queue.get()
                if url in self.visited or depth > max_depth:
                    continue
                self.visited.add(url)
                future = self.executor.submit(self._process_page, url, depth)
                future.add_done_callback(self._handle_result)
                futures.append(future)
        return [f.result() for f in futures if f.result()]

    def _process_page(self, url, depth):
        """Fetch, parse, and save a single page."""
        # Serve from cache if the entry is still fresh
        if url in self.cache:
            if time.time() - self.cache[url]['timestamp'] < CONFIG['OPTIMIZATION']['cache_ttl']:
                return self.cache[url]['content']
        # Fetch the page
        html = self.session.get(url)
        if not html:
            return None
        # Parse the HTML
        soup = self.parser.safe_parse(html)
        if not soup:
            return None
        # Extract and clean the text
        text = self._clean_text(soup.get_text())
        content = {'url': url, 'text': text, 'html': html, 'depth': depth}
        # Save the content if it passes the value filter
        if self.file_manager.save_content(content, url):
            self.cache[url] = {'content': content, 'timestamp': time.time()}
            return content
        return None

    def _handle_result(self, future):
        """Handle a finished crawl task."""
        try:
            content = future.result()
            if content:
                self.progress.update(1)
                self._find_links(content['html'], content['url'], content.get('depth', 1) + 1)
        except Exception as e:
            logger.error(f"Processing error: {str(e)}")

    def _find_links(self, html, base_url, depth):
        """Discover new links on a page."""
        soup = self.parser.safe_parse(html)
        new_links = self.parser.extract_links(soup, base_url)
        for link in new_links:
            priority = self._calculate_priority(link)
            self._add_url(link, depth=depth, priority=priority)

    def _calculate_priority(self, url):
        """Compute a crawl priority for a URL."""
        # Base priority
        priority = 0.5
        # Matches a target URL pattern
        if any(re.search(p, url) for p in CONFIG['OPTIMIZATION']['url_patterns']):
            priority += 0.3
        # Contains a target keyword
        if any(kw in url for kw in CONFIG['OPTIMIZATION']['content_keywords']):
            priority += 0.2
        return min(max(priority, 0), 1)

    def _add_url(self, url, depth=1, priority=None):
        """Add a URL to the queue."""
        if priority is None:
            priority = self._calculate_priority(url)
        # Add a random factor so the queue ordering does not become rigid
        priority += random.uniform(-0.1, 0.1) * CONFIG['OPTIMIZATION']['priority_bias']
        self.url_queue.put((1 - priority, depth, url))  # higher priority -> smaller sort key

    @staticmethod
    def _clean_text(text):
        """Clean extracted text."""
        # Remove control characters
        text = re.sub(r'[\x00-\x1F\x7F-\x9F]', '', text)
        # Collapse whitespace
        text = re.sub(r'\s+', ' ', text)
        # Break into paragraphs at sentence-ending punctuation
        return re.sub(r'(?<=[。!?])(?=\S)', '\n', text).strip()
if __name__ == '__main__':
    crawler = SmartCrawler()
    try:
        print("Starting smart crawler...")
        results = crawler.crawl(
            start_url="https://www.21ic.com/article/906177.html",
            max_depth=3
        )
        print(f"\nDone! Saved {len(results)} valid items to {CONFIG['SAVING']['output_md']}")
    except KeyboardInterrupt:
        logger.info("Interrupted by user")
    except Exception as e:
        logger.error(f"Runtime error: {str(e)}")
```
**Setup:**
```bash
# Install dependencies (lxml is optional but preferred by the parser)
pip install requests beautifulsoup4 lxml tqdm selenium webdriver-manager
```
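The dependency list above includes `webdriver-manager`, which the script itself never imports; it only matters if `chromedriver` is not already on your PATH. A minimal sketch of wiring it into the driver setup used by `CrawlerSession._get_dynamic` (assuming Selenium 4) could look like this:
```python
# Hypothetical variant of the headless-Chrome setup: let webdriver-manager
# download a matching chromedriver instead of relying on one being on PATH.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

options = webdriver.ChromeOptions()
options.add_argument('--headless')
options.add_argument('--disable-gpu')
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
```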
**Code features:**
1. **Smart priority queue**: fetch order is adjusted automatically based on URL patterns and keywords in the URL (see the sketch after this list)
2. **Dynamic-content support**: an optional headless Chrome browser handles JavaScript-rendered pages
3. **Content value filtering**: ads and low-value content are skipped automatically
4. **Caching**: a URL revisited within one hour is served straight from the cache
5. **Concurrency**: a thread pool speeds up crawling
6. **Anti-blocking measures**: random waits plus a minimum interval between requests
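As a rough illustration of item 1 (not part of the crawler itself): `_add_url` pushes `(1 - priority, depth, url)` tuples, so a URL that matches `url_patterns` or contains a keyword gets a smaller sort key and is dequeued earlier. The URLs and priority values below are made up for the example:
```python
from queue import PriorityQueue

q = PriorityQueue()
# Illustrative priorities, roughly what _calculate_priority would assign
q.put((1 - 0.8, 1, 'https://example.com/article/101.html'))  # matches /article/\d+\.html -> high priority
q.put((1 - 0.5, 1, 'https://example.com/about.html'))        # no pattern or keyword match -> base priority

print(q.get()[2])  # -> https://example.com/article/101.html is fetched first
```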
**Configuration:**
- Edit `CONFIG['OPTIMIZATION']['content_keywords']` to set the target keywords (combined example below)
- Adjust `CONFIG['PERFORMANCE']['max_workers']` to change the level of concurrency
- Set `CONFIG['OPTIMIZATION']['dynamic_content'] = True` to enable dynamic-content rendering
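For instance, a run targeting AI-chip articles with dynamic rendering enabled might tweak the config like this before constructing `SmartCrawler` (the keyword list and worker count here are only illustrative):
```python
# Illustrative overrides; apply them before SmartCrawler() is created
CONFIG['OPTIMIZATION']['content_keywords'] = ['芯片', 'AI芯片', '半导体']
CONFIG['OPTIMIZATION']['dynamic_content'] = True   # use headless Chrome for JS-rendered pages
CONFIG['PERFORMANCE']['max_workers'] = 5           # fewer threads to stay gentle on the site
```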
**Sample output:**
```markdown
## [半导体技术新突破](https://www.21ic.com/article/123.html)
**来源**: https://www.21ic.com/article/123.html
近年来,半导体制造技术取得了重大突破...
```