#!/usr/bin/env python3
# coding: utf-8
# File: data_spider.py
# Author: lhy<lhy_in_blcu@126.com,https://huangyong.github.io>
# Date: 18-10-3
import urllib.request
import urllib.parse
from urllib.error import URLError, HTTPError
from lxml import etree # type: ignore
import pymongo
import re
import os
import random
import time
import configparser
import json
from http.cookiejar import CookieJar
from urllib.request import HTTPCookieProcessor
from typing import Optional, Dict, List
class CrimeSpider:
def __init__(self):
self.conn = pymongo.MongoClient(
host='localhost',
port=27017,
username='root',
password='password'
)
self.db = self.conn['medical']
self.col = self.db['data']
        # Load the anti-crawl configuration
self.config = self.load_anti_crawl_config()
        # Crawler runtime settings
self.timeout = self.config.get('timeout', 12)
        self.max_retries = int(self.config.get('max_retries', 3))  # int: used for slicing and attempt counting
self.base_delay_min = self.config.get('base_delay_min', 1.0)
self.base_delay_max = self.config.get('base_delay_max', 3.0)
self.failure_backoff = self.config.get('failure_backoff', 2.0)
self.max_delay = self.config.get('max_delay', 30.0)
self.adaptive_delay_factor = self.config.get('adaptive_delay_factor', 0.5)
        # Load the proxy pool from conf/proxy.txt if it exists
self.proxy_pool = self.load_proxy_pool()
        # User-Agent pool
self.user_agents = self.load_user_agents()
self.ua_strategy = self.config.get('ua_strategy', 'random')
self.current_ua_index = 0
        # Session (cookie) management
self.enable_session = self.config.get('enable_session', True)
if self.enable_session:
self.cookie_jar = CookieJar()
        # Request statistics
self.request_count = 0
self.last_request_time = 0
        # Retry configuration
        retry_codes = str(self.config.get('retry_status_codes', '429,503,502,504,520,521,522,524'))
        self.retry_status_codes = set(int(c) for c in retry_codes.split(','))
        proxy_codes = str(self.config.get('proxy_change_codes', '403,407,429'))
        self.proxy_change_codes = set(int(c) for c in proxy_codes.split(','))
    '''Fetch the HTML for a given URL'''
def get_html(self, url):
"""使用可选的随机代理抓取HTML,带完整反爬策略。"""
        # Adaptive delay between requests
self._smart_delay()
        # Build randomized request headers
headers = self._build_random_headers(url)
last_err = None
        # Prepare the list of proxies to try
proxies_to_try = []
if self.proxy_pool:
            # Try at most self.max_retries distinct proxies
pool_sample = self.proxy_pool.copy()
random.shuffle(pool_sample)
proxies_to_try = pool_sample[: self.max_retries]
        # Always finish with one direct (no-proxy) attempt
proxies_to_try.append(None)
for attempt, proxy in enumerate(proxies_to_try, start=1):
try:
req = urllib.request.Request(url=url, headers=headers)
                # Build an opener with cookie and proxy support
handlers = []
if self.enable_session:
handlers.append(HTTPCookieProcessor(self.cookie_jar))
if proxy:
handlers.append(urllib.request.ProxyHandler(proxy))
if handlers:
opener = urllib.request.build_opener(*handlers)
res = opener.open(req, timeout=self.timeout)
else:
res = urllib.request.urlopen(req, timeout=self.timeout)
html = res.read().decode('gbk', errors='ignore')
return html
except HTTPError as e:
last_err = e
status_code = e.code
                # Decide from the status code whether to drop the proxy and whether to retry
if self._should_change_proxy(status_code) and proxy and proxy in self.proxy_pool:
try:
self.proxy_pool.remove(proxy)
except ValueError:
pass
                # Stop if this status code should not be retried
if not self._should_retry(status_code, attempt):
break
                # Exponential backoff before the next attempt
delay = min(self.base_delay_max * (self.failure_backoff ** (attempt - 1)), self.max_delay)
time.sleep(random.uniform(delay * 0.5, delay))
except (URLError, TimeoutError, ConnectionError, OSError) as e:
last_err = e
                # Network error: drop the failed proxy and retry
if proxy and proxy in self.proxy_pool:
try:
self.proxy_pool.remove(proxy)
except ValueError:
pass
                # Short random backoff to avoid hammering the server
time.sleep(random.uniform(0.5, 1.2))
        # If every attempt failed, re-raise the last error
raise last_err or RuntimeError('Failed to fetch url: %s' % url)
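    # Usage sketch (illustrative, not part of the class): a single get_html call applies
    # the smart delay, randomized headers, proxy rotation and exponential backoff. The URL
    # below is just an example disease page on the target site.
    #
    #   spider = CrimeSpider()
    #   html = spider.get_html('http://jib.xywy.com/il_sii/1.htm')
    #   selector = etree.HTML(html)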
def load_proxy_pool(self, path: Optional[str] = None):
"""
从 conf/proxy.txt 加载代理列表。
支持的每行格式:
- http://user:pass@host:port
- https://host:port
- host:port (默认按 http 处理)
以 “#” 开头的行为注释将被忽略。
返回可用于 urllib ProxyHandler 的代理字典列表,例如:
{'http': 'http://host:port', 'https': 'http://host:port'}。
"""
try:
if not path:
                # Project root = parent of the directory containing this file
project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))
path = os.path.join(project_root, 'conf', 'proxy.txt')
if not os.path.isfile(path):
return []
proxies = []
with open(path, 'r', encoding='utf-8') as f:
for line in f:
raw = line.strip()
if not raw or raw.startswith('#'):
continue
url = raw
if not raw.startswith('http://') and not raw.startswith('https://'):
url = 'http://' + raw
                    # Map both http and https to the same endpoint
proxies.append({'http': url, 'https': url})
return proxies
except Exception:
return []
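    # Example conf/proxy.txt accepted by the loader above (hosts and ports are
    # placeholders, not real proxies; each non-comment line becomes one
    # {'http': url, 'https': url} entry for urllib's ProxyHandler):
    #
    #   # lines starting with '#' are ignored
    #   http://user:pass@10.0.0.1:8080
    #   https://10.0.0.2:3128
    #   10.0.0.3:8888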
    '''Parse listing-page URLs'''
def url_parser(self, content):
selector = etree.HTML(content)
urls = ['http://www.anliguan.com' + i for i in selector.xpath('//h2[@class="item-title"]/a/@href')]
return urls
    '''Main disease-page crawl loop'''
def spider_main(self):
for page in range(1, 11000):
try:
basic_url = 'http://jib.xywy.com/il_sii/%s.htm'%page
cause_url = 'http://jib.xywy.com/il_sii/cause/%s.htm'%page
prevent_url = 'http://jib.xywy.com/il_sii/prevent/%s.htm'%page
symptom_url = 'http://jib.xywy.com/il_sii/symptom/%s.htm'%page
inspect_url = 'http://jib.xywy.com/il_sii/inspect/%s.htm'%page
treat_url = 'http://jib.xywy.com/il_sii/treat/%s.htm'%page
food_url = 'http://jib.xywy.com/il_sii/food/%s.htm'%page
drug_url = 'http://jib.xywy.com/il_sii/drug/%s.htm'%page
data = {}
data['url'] = basic_url
data['basic_info'] = self.basicinfo_spider(basic_url)
data['cause_info'] = self.common_spider(cause_url)
data['prevent_info'] = self.common_spider(prevent_url)
data['symptom_info'] = self.symptom_spider(symptom_url)
data['inspect_info'] = self.inspect_spider(inspect_url)
data['treat_info'] = self.treat_spider(treat_url)
data['food_info'] = self.food_spider(food_url)
data['drug_info'] = self.drug_spider(drug_url)
print(page, basic_url)
                self.col.insert_one(data)
except Exception as e:
print(e, page)
return
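    # Illustrative shape of one document written to the 'data' collection (values show
    # what the parser methods below return, not real crawled data):
    #
    #   {
    #       'url': 'http://jib.xywy.com/il_sii/1.htm',
    #       'basic_info': {'category': [...], 'name': '...', 'desc': [...], 'attributes': [...]},
    #       'cause_info': '...',      # newline-joined paragraphs
    #       'prevent_info': '...',
    #       'symptom_info': {'symptoms': [...], 'symptoms_detail': [...]},
    #       'inspect_info': [...],    # hrefs of related check items
    #       'treat_info': [...],
    #       'food_info': {'good': [...], 'bad': [...], 'recommand': [...]},
    #       'drug_info': [...],
    #   }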
    '''Parse basic disease information'''
def basicinfo_spider(self, url):
html = self.get_html(url)
selector = etree.HTML(html)
title = selector.xpath('//title/text()')[0]
category = selector.xpath('//div[@class="wrap mt10 nav-bar"]/a/text()')
desc = selector.xpath('//div[@class="jib-articl-con jib-lh-articl"]/p/text()')
ps = selector.xpath('//div[@class="mt20 articl-know"]/p')
infobox = []
for p in ps:
info = p.xpath('string(.)').replace('\r','').replace('\n','').replace('\xa0', '').replace(' ', '').replace('\t','')
infobox.append(info)
basic_data = {}
basic_data['category'] = category
basic_data['name'] = title.split('的简介')[0]
basic_data['desc'] = desc
basic_data['attributes'] = infobox
return basic_data
    '''Parse treatment information (treat_infobox)'''
def treat_spider(self, url):
html = self.get_html(url)
selector = etree.HTML(html)
ps = selector.xpath('//div[starts-with(@class,"mt20 articl-know")]/p')
infobox = []
for p in ps:
info = p.xpath('string(.)').replace('\r','').replace('\n','').replace('\xa0', '').replace(' ', '').replace('\t','')
infobox.append(info)
return infobox
    '''Parse recommended drug information'''
def drug_spider(self, url):
html = self.get_html(url)
selector = etree.HTML(html)
drugs = [i.replace('\n','').replace('\t', '').replace(' ','') for i in selector.xpath('//div[@class="fl drug-pic-rec mr30"]/p/a/text()')]
return drugs
    '''Parse dietary (food) information'''
def food_spider(self, url):
html = self.get_html(url)
selector = etree.HTML(html)
divs = selector.xpath('//div[@class="diet-img clearfix mt20"]')
try:
food_data = {}
food_data['good'] = divs[0].xpath('./div/p/text()')
food_data['bad'] = divs[1].xpath('./div/p/text()')
food_data['recommand'] = divs[2].xpath('./div/p/text()')
        except IndexError:
            return {}
return food_data
    '''Parse symptom information'''
def symptom_spider(self, url):
html = self.get_html(url)
selector = etree.HTML(html)
        symptoms = selector.xpath('//a[@class="gre"]/text()')
ps = selector.xpath('//p')
detail = []
for p in ps:
info = p.xpath('string(.)').replace('\r','').replace('\n','').replace('\xa0', '').replace(' ', '').replace('\t','')
detail.append(info)
symptoms_data = {}
symptoms_data['symptoms'] = symptoms
symptoms_data['symptoms_detail'] = detail
        return symptoms_data
    '''Parse related medical-check information'''
def inspect_spider(self, url):
html = self.get_html(url)
selector = etree.HTML(html)
inspects = selector.xpath('//li[@class="check-item"]/a/@href')
return inspects
    '''Generic paragraph-text parser'''
def common_spider(self, url):
html = self.get_html(url)
selector = etree.HTML(html)
ps = selector.xpath('//p')
infobox = []
for p in ps:
info = p.xpath('string(.)').replace('\r', '').replace('\n', '').replace('\xa0', '').replace(' ','').replace('\t', '')
if info:
infobox.append(info)
return '\n'.join(infobox)
    '''Crawl raw medical-check (jc) pages'''
def inspect_crawl(self):
for page in range(1, 3685):
try:
url = 'http://jck.xywy.com/jc_%s.html'%page
html = self.get_html(url)
data = {}
data['url']= url
data['html'] = html
                self.db['jc'].insert_one(data)
print(url)
except Exception as e:
print(e)
def load_anti_crawl_config(self) -> Dict:
"""加载反爬配置文件。"""
config_path = os.path.join(os.path.dirname(__file__), '..', 'conf', 'anti_crawl.conf')
config = {}
if os.path.isfile(config_path):
try:
parser = configparser.ConfigParser()
parser.read(config_path, encoding='utf-8')
                # Walk every section; keys are flattened into a single dict
for section in parser.sections():
for key, value in parser[section].items():
                        # Coerce values: booleans, status-code lists, numbers, then plain strings
if value.lower() in ['true', 'false']:
config[key] = value.lower() == 'true'
                        elif key in ['retry_status_codes', 'proxy_change_codes']:
                            # Keep status-code lists as strings; __init__ splits them on commas
                            config[key] = value
else:
try:
config[key] = float(value)
except ValueError:
config[key] = value
except Exception:
pass
return config
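    # Example conf/anti_crawl.conf consumed by the loader above. Section names are
    # flattened away, so only the keys matter; the section layout and values below are
    # illustrative and mirror the defaults used in __init__:
    #
    #   [request]
    #   timeout = 12
    #   max_retries = 3
    #   ua_strategy = random
    #   referer_strategy = target
    #   enable_session = true
    #
    #   [delay]
    #   base_delay_min = 1.0
    #   base_delay_max = 3.0
    #   failure_backoff = 2.0
    #   max_delay = 30.0
    #   adaptive_delay_factor = 0.5
    #
    #   [retry]
    #   retry_status_codes = 429,503,502,504,520,521,522,524
    #   proxy_change_codes = 403,407,429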
def load_user_agents(self) -> List[str]:
"""加载User-Agent池。"""
return [
# Chrome Windows
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
# Firefox Windows
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:120.0) Gecko/20100101 Firefox/120.0',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:119.0) Gecko/20100101 Firefox/119.0',
# Safari macOS
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.1 Safari/605.1.15',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
# Edge Windows
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36 Edg/120.0.0.0',
]
def _get_random_user_agent(self) -> str:
"""获取随机User-Agent。"""
if self.ua_strategy == 'sequential':
ua = self.user_agents[self.current_ua_index % len(self.user_agents)]
self.current_ua_index += 1
return ua
else:
return random.choice(self.user_agents)
def _build_random_headers(self, url: str) -> Dict[str, str]:
"""构建随机化请求头。"""
headers = {
'User-Agent': self._get_random_user_agent(),
'Accept': random.choice([
'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
]),
'Accept-Language': random.choice([
'zh-CN,zh;q=0.9,en;q=0.8',
'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2',
'zh-CN,zh;q=0.9'
]),
            # urllib.request does not transparently decompress responses, so only accept identity
            'Accept-Encoding': 'identity',
'DNT': str(random.choice([0, 1])),
'Connection': 'keep-alive',
'Upgrade-Insecure-Requests': '1',
}
        # Add a Referer header according to the configured strategy
referer_strategy = self.config.get('referer_strategy', 'target')
if referer_strategy == 'target':
            parsed = urllib.parse.urlparse(url)  # urllib.parse is imported at module level
headers['Referer'] = f"{parsed.scheme}://{parsed.netloc}/"
elif referer_strategy == 'random':
headers['Referer'] = random.choice([
'https://www.google.com/',
'https://www.baidu.com/',
'https://www.bing.com/',
])
return headers
def _smart_delay(self):
"""智能延时策略。"""
current_time = time.time()
        # Base random delay
base_delay = random.uniform(self.base_delay_min, self.base_delay_max)
        # If there was a previous request, only sleep the remaining part of the delay
if self.last_request_time > 0:
elapsed = current_time - self.last_request_time
if elapsed < base_delay:
time.sleep(base_delay - elapsed)
else:
time.sleep(base_delay)
self.last_request_time = time.time()
self.request_count += 1
def _should_retry(self, status_code: int, attempt: int) -> bool:
"""判断是否需要重试。"""
if attempt >= self.max_retries:
return False
return status_code in self.retry_status_codes
def _should_change_proxy(self, status_code: int) -> bool:
"""判断是否需要更换代理。"""
return status_code in self.proxy_change_codes
# handler = CrimeSpider()
# handler.inspect_crawl()
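# To also run the disease crawl (assumes a reachable MongoDB at localhost:27017 with the
# credentials configured in __init__):
# handler.spider_main()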