从零开始写一个IP代理池

该代码实现了一个基于 Python 的 HTTP 代理池系统，包括代理的获取、存储、过期检查和分配策略。它使用 PriorityQueue 按过期时间管理代理，支持以随机、轮询和 LRU 三种方式获取多个代理。同时，有两个线程分别负责生产新代理和监控过期代理，Flask 应用提供 API 接口供外部请求。

摘要生成于 C知道 ,由 DeepSeek-R1 满血版支持, 前往体验 >


# -*- encoding: utf-8 -*-
import os
import json
from queue import PriorityQueue
import time
import random
import logging
import threading
import requests
from flask import Flask, jsonify, request

# Safety margin in seconds: Proxy subtracts it from a proxy's remaining
# lifetime when computing its priority, and the worker threads use it as the
# base unit for their sleep intervals.
UN_USED_EXPIRED_TIME=5
# Value that HttpProxyPool.put_proxies formats into `url`.
# NOTE(review): presumably the number of proxies requested per fetch — confirm
# against the upstream API.
PER_SIZE_PRODUCER = 10
# Upstream endpoint that hands out proxies.
# NOTE(review): this literal contains no %-placeholder, so %-formatting it with
# an argument raises TypeError. The real endpoint presumably carried a count
# placeholder — confirm the intended URL.
url = "http://192.168.1.1:50000/V0/get-ip/"

class Proxy:
    """A proxy address plus the timestamp at which it should be evicted.

    Instances are ordered by ``priority`` so that a priority queue yields the
    proxy that must be evicted soonest first.

    Attributes:
        priority: Unix timestamp (whole seconds) at which the proxy is treated
            as expired by the pool.
        ip: The proxy address as given by the caller (the pool passes
            ``"host:port"``).
        expired_time: The upstream-reported end time; stored and shown in
            ``__str__`` only.
    """

    def __init__(self, ip, expired_seconds, expired_time):
        """Create a proxy record.

        Args:
            ip: Proxy address.
            expired_seconds: Remaining lifetime of the proxy in seconds.
            expired_time: Upstream-reported end time, kept for display.
        """
        # Retire the proxy UN_USED_EXPIRED_TIME seconds before its real expiry
        # so that a proxy about to die is never handed out.
        self.priority = int(time.time()) + expired_seconds - UN_USED_EXPIRED_TIME
        self.ip = ip
        self.expired_time = expired_time

    def __lt__(self, other):
        # Strict comparison: with "<=" two proxies of equal priority would each
        # claim to be smaller than the other, contradicting __gt__.
        return self.priority < other.priority

    def __gt__(self, other):
        return self.priority > other.priority

    def __str__(self):
        return "Proxy(priority[%s], ip[%s], endtime[%s])" % (self.priority, self.ip, self.expired_time)

class HttpProxyPool:
    """In-memory pool of proxy addresses with three hand-out policies.

    The pool keeps three structures, all mutated under ``_lock``:

    * a priority queue of ``Proxy`` objects ordered by eviction time,
    * ``_ip_proxies``: addresses in insertion order (random / round-robin),
    * ``_lru_proxies``: addresses ordered least-recently-handed-out first.
    """

    def __init__(self):
        self._pressure_ip_proxy_queue = PriorityQueue()
        self._ip_proxies = []
        self._lru_proxies = []
        self._round_robin_index = 0
        self._lock = threading.Lock()

    def put_proxies(self):
        """Fetch a batch of proxies from the upstream API and add the new ones.

        The response body is parsed as JSON and must contain a ``"data"`` list
        whose items carry ``"ip"``, ``"port"``, ``"timeout"`` and
        ``"end_time"``. Addresses already in the pool are skipped.

        Raises:
            requests.RequestException: on network failure or timeout.
            ValueError / KeyError: when the response does not have the
                expected shape.
        """
        try:
            request_url = url % PER_SIZE_PRODUCER
        except (TypeError, ValueError):
            # The configured URL carries no usable %-placeholder; request it
            # unchanged instead of failing before any request is made.
            request_url = url
        # A timeout keeps the producer thread from hanging on a dead upstream.
        response = requests.get(request_url, timeout=10)
        ips = json.loads(response.text)["data"]
        for ip in ips:
            host = ip["ip"]
            port = ip["port"]
            # Keep the text before "(" in the timeout field. partition() keeps
            # the whole value when there is no "(", whereas slicing with
            # find()'s -1 would silently drop the last character.
            expired_seconds = int(str(ip["timeout"]).partition("(")[0])
            expired_time = ip["end_time"]
            proxy = Proxy("%s:%s" % (host, port), expired_seconds, expired_time)

            if not self._exists_proxy(proxy):
                self._put_one_proxy(proxy)
            else:
                logging.info("%s existsed..." % str(proxy))

    def _put_one_proxy(self, proxy):
        """Register ``proxy`` in the expiry queue and in both address lists."""
        with self._lock:
            self._pressure_ip_proxy_queue.put(proxy)
            self._ip_proxies.append(proxy.ip)
            self._lru_proxies.append(proxy.ip)

    def _delete_one_proxy(self, proxy):
        """Remove ``proxy``'s address from both address lists if present."""
        with self._lock:
            # Handle each list independently so that a missing entry in one
            # cannot leave a stale entry behind in the other.
            for addresses in (self._ip_proxies, self._lru_proxies):
                if proxy.ip in addresses:
                    addresses.remove(proxy.ip)

    def _exists_proxy(self, proxy):
        """Return True when ``proxy``'s address is present in both lists."""
        ip = proxy.ip
        with self._lock:
            return ip in self._ip_proxies and ip in self._lru_proxies

    def delete_expired_proxies(self):
        """Evict every proxy whose priority timestamp has passed.

        Returns immediately when the queue is empty instead of blocking, so
        the caller's loop keeps running while the pool is empty.
        """
        from queue import Empty

        while True:
            try:
                proxy = self._pressure_ip_proxy_queue.get_nowait()
            except Empty:
                break
            if proxy.priority > int(time.time()):
                # The earliest entry is still valid, so all others are too.
                self._pressure_ip_proxy_queue.put(proxy)
                break
            # Fine for a small pool; list removal is linear, so a large pool
            # makes this slow.
            self._delete_one_proxy(proxy)
            logging.error("%s expired and removed" % str(proxy))

    def random_get_one_proxy(self):
        """Return one random address, or None when the pool is empty."""
        with self._lock:
            if not self._ip_proxies:
                return None
            return random.choice(self._ip_proxies)

    # random
    def random_get_multi_proxies(self, size):
        """Return up to ``size`` distinct addresses chosen at random.

        When ``size`` is at least the pool size, a copy of the whole pool is
        returned. A non-positive ``size`` yields an empty list.
        """
        with self._lock:
            # Work on a snapshot so callers never receive the internal list.
            candidates = self._ip_proxies[:]
        if size <= 0:
            return []
        if size >= len(candidates):
            return candidates
        # sample() picks without replacement, so no address is repeated.
        return random.sample(candidates, size)

    # round robin
    def round_robin_get_multi_proxies(self, size):
        """Return the next ``size`` addresses in cyclic insertion order.

        When ``size`` is at least the pool size, a copy of the whole pool is
        returned and the cursor is reset. A non-positive ``size`` yields an
        empty list and leaves the cursor untouched.
        """
        with self._lock:
            total = len(self._ip_proxies)
            if size >= total:
                self._round_robin_index = 0
                return self._ip_proxies[:]
            if size <= 0:
                return []
            start = self._round_robin_index
            if start >= total:
                # The pool shrank since the last call; restart from the front.
                start = 0
            # Modular indexing returns exactly `size` items across the wrap.
            results = [self._ip_proxies[(start + offset) % total] for offset in range(size)]
            self._round_robin_index = (start + size) % total
            return results

    # lru
    def lru_get_multi_proxies(self, size):
        """Return the ``size`` least recently handed-out addresses.

        The returned addresses move to the back of the LRU list. When ``size``
        is at least the pool size, a copy of the whole LRU list is returned.
        A non-positive ``size`` yields an empty list.
        """
        with self._lock:
            if size >= len(self._lru_proxies):
                return self._lru_proxies[:]
            if size <= 0:
                return []
            results = self._lru_proxies[:size]
            # Move the addresses just handed out to the back of the list.
            del self._lru_proxies[:size]
            self._lru_proxies.extend(results)
            return results

    def round_robin_index_(self):
        """Return the current round-robin cursor position."""
        return self._round_robin_index

    def __len__(self):
        """Return the number of addresses currently in the pool."""
        return len(self._ip_proxies)

# Module-level pool instance shared by the worker threads and the Flask routes
# defined in this file.
http_proxy_pool = HttpProxyPool()
# Proxy monitor
class HttpProxyMonitor(threading.Thread):
    """Thread that repeatedly evicts expired proxies from ``http_proxy_pool``."""

    def __init__(self, name):
        """Create the thread with the given thread ``name``."""
        super(HttpProxyMonitor, self).__init__(name=name)

    def run(self):
        """Evict expired proxies, report the pool state, sleep, and repeat.

        A failure in one pass is logged and does not stop the thread.
        """
        while True:
            try:
                print("开始监控代理")
                http_proxy_pool.delete_expired_proxies()
                logging.info("current pool size[%s], round_robin_index[%s]" % (len(http_proxy_pool), http_proxy_pool.round_robin_index_()))
                print("current pool size[%s], round_robin_index[%s]" % (len(http_proxy_pool), http_proxy_pool.round_robin_index_()))
            except Exception:
                # Record the failure instead of discarding it silently.
                logging.exception("proxy monitor pass failed")
            # Sleep outside the try block so a failing pass cannot turn the
            # loop into a busy spin.
            time.sleep(UN_USED_EXPIRED_TIME)

# Proxy producer
class HttpProxyProducer(threading.Thread):
    """Thread that repeatedly fetches new proxies into ``http_proxy_pool``."""

    def __init__(self, name):
        """Create the thread with the given thread ``name``."""
        super(HttpProxyProducer, self).__init__(name=name)

    def run(self):
        """Fetch a batch of proxies, sleep, and repeat.

        A failure in one pass is logged and does not stop the thread.
        """
        while True:
            try:
                print("开始生产代理")
                http_proxy_pool.put_proxies()
            except Exception:
                # Record the failure instead of discarding it silently.
                logging.exception("proxy producer pass failed")
            # Sleep outside the try block so a failing fetch is retried after
            # the normal interval instead of in a tight loop.
            time.sleep(UN_USED_EXPIRED_TIME * 3)

# Create and start both worker threads. This runs at module load, so importing
# this file also starts the threads.
# NOTE(review): neither thread is marked as a daemon and both run endless
# loops, so they presumably keep the interpreter alive after the main thread
# ends — confirm this is intended.
producer = HttpProxyProducer("producer")
monitor = HttpProxyMonitor("monitor")
producer.start()
monitor.start()

# Flask application object on which the /proxy/... routes are registered.
app = Flask(__name__)
@app.route('/proxy/random/')
def random_get_proxies():       # http://127.0.0.1:8080/proxy/random/?size=2
    """Return a JSON array of proxy addresses picked at random from the pool.

    Query parameters:
        size: number of proxies requested; must be an integer.

    Responds with HTTP 400 and a JSON error object when ``size`` is missing
    or is not an integer.
    """
    # With type=int the lookup yields None when the value is absent or cannot
    # be converted, instead of raising inside the view.
    size = request.args.get('size', type=int)
    if size is None:
        return jsonify({"error": "query parameter 'size' must be an integer"}), 400
    proxies = http_proxy_pool.random_get_multi_proxies(size)
    return jsonify(proxies)

@app.route('/proxy/lru/')
def lru_get_proxies():       # http://127.0.0.1:8080/proxy/lru/?size=2
    """Return a JSON array of proxy addresses chosen by the pool's LRU policy.

    Query parameters:
        size: number of proxies requested; must be an integer.

    Responds with HTTP 400 and a JSON error object when ``size`` is missing
    or is not an integer.
    """
    # With type=int the lookup yields None when the value is absent or cannot
    # be converted, instead of raising inside the view.
    size = request.args.get('size', type=int)
    if size is None:
        return jsonify({"error": "query parameter 'size' must be an integer"}), 400
    proxies = http_proxy_pool.lru_get_multi_proxies(size)
    return jsonify(proxies)

@app.route('/proxy/round_robin/')
def round_robin_get_proxies():       # http://127.0.0.1:8080/proxy/round_robin/?size=2
    """Return a JSON array of proxy addresses chosen in round-robin order.

    Query parameters:
        size: number of proxies requested; must be an integer.

    Responds with HTTP 400 and a JSON error object when ``size`` is missing
    or is not an integer.
    """
    # With type=int the lookup yields None when the value is absent or cannot
    # be converted, instead of raising inside the view.
    size = request.args.get('size', type=int)
    if size is None:
        return jsonify({"error": "query parameter 'size' must be an integer"}), 400
    proxies = http_proxy_pool.round_robin_get_multi_proxies(size)
    return jsonify(proxies)

def init_log_config():
    """Configure root logging at INFO level with a file and a console handler.

    The log file path depends on the platform: a fixed path under /data/log
    when ``os.name`` is "posix", otherwise a fixed path on drive e:.
    """
    # Pick the platform-specific log file location.
    log_path = "/data/log/http_proxy.log" if os.name == "posix" else "e:/http_proxy.log"
    # File handler first, then console handler, matching the hand-over order.
    sinks = [
        logging.FileHandler(log_path, encoding='utf-8'),
        logging.StreamHandler(),
    ]
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",  # record layout
        datefmt="%Y-%m-%d %H:%M:%S %p",  # timestamp layout
        handlers=sinks,
    )

# When run as a script, serve the Flask app on all network interfaces, port 8080.
# NOTE(review): init_log_config() is defined in this file but is not called
# anywhere in the visible source — confirm whether logging was meant to be
# configured before the server and worker threads start.
if __name__ == "__main__":
    app.run(host="0.0.0.0", port=8080)

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值