Using a SOCKS5 proxy with pymongo

This post shows how to connect to MongoDB with Python's pymongo library while routing the traffic through a SOCKS5 proxy. The approach is to register a default SOCKS5 proxy with PySocks and monkey-patch the socket module, so that every connection opened through socket, including pymongo's, goes through the specified proxy server.

```python
# Requires the PySocks package: pip install pysocks
from pymongo import MongoClient
import socket
import socks

# Register a default SOCKS5 proxy (here on localhost:8899) and monkey-patch
# socket.socket so every new connection is routed through it; this must run
# before MongoClient opens its first connection.
socks.set_default_proxy(socks.SOCKS5, "localhost", 8899)
socket.socket = socks.socksocket

# pymongo now reaches 172.18.0.2 through the proxy ("db" database, "web" collection).
data = MongoClient('172.18.0.2').db.web.find({})
```
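
PySocks also accepts proxy credentials, and it can be useful to undo the monkey-patch once the query is done. Below is a minimal sketch along those lines, reusing the example addresses above (`localhost:8899` for the proxy, `172.18.0.2` for MongoDB); the proxy username and password are hypothetical placeholders, and the snippet is an illustration rather than a drop-in recipe.

```python
# A minimal sketch, assuming the same proxy (localhost:8899) and MongoDB host
# (172.18.0.2) as above; "proxyuser"/"proxypass" are hypothetical placeholders.
import socket
import socks
from pymongo import MongoClient

_original_socket = socket.socket

# SOCKS5 proxy with optional authentication.
socks.set_default_proxy(socks.SOCKS5, "localhost", 8899,
                        username="proxyuser", password="proxypass")
socket.socket = socks.socksocket  # new connections now go through the proxy

client = MongoClient('172.18.0.2', serverSelectionTimeoutMS=5000)
try:
    client.admin.command('ping')            # fails fast if the proxy is unreachable
    for doc in client.db.web.find({}):
        print(doc)
finally:
    client.close()
    # Restore the original socket class so later code connects directly again.
    # For a long-lived client, leave the patch in place instead, because any
    # socket pymongo opens after this point would bypass the proxy.
    socket.socket = _original_socket
```

Keep in mind that the patch is process-wide: while it is active, every library that creates sockets through `socket.socket` in the same process is proxied, and pymongo's background monitoring connections only stay on the proxy for as long as the patch remains in place.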
