Installing the modules
Install BeautifulSoup:
pip install beautifulsoup4
Install the lxml HTML parser:
pip install lxml
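To confirm both packages are importable, a quick optional check from the command line (bs4 exposes a __version__ attribute):
python -c "import bs4, lxml; print(bs4.__version__)"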
Fetching proxy IP information from www.xicidaili.com
import requests
from bs4 import BeautifulSoup

requestHeader = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36"}
url = 'http://www.xicidaili.com/nn/'
# Fetch the HTML page
info = requests.get(url, headers=requestHeader)
html_doc = info.text
# Parse the HTML page
soup = BeautifulSoup(html_doc, "lxml")
# Locate the table by its id and grab all of its rows
trs = soup.find('table', id='ip_list').find_all('tr')

# Loop over the table rows, pull out each field, and print it
for tr in trs[1:]:  # trs[0] is the header row
    tds = tr.find_all('td')
    if tds[0].find('img') is None:
        nation = 'unknown'
        locate = 'unknown'
    else:
        nation = tds[0].find('img')['alt'].strip()
        locate = tds[3].text.strip()
    ip = tds[1].text.strip()    # IP address
    port = tds[2].text.strip()  # port
    anony = tds[5].text.strip()     # anonymity level
    protocol = tds[6].text.strip()  # protocol
    speed = tds[7].find('div')['title'].strip()  # speed
    time = tds[9].text.strip()  # last-verified time
    print('%s|%s|%s|%s|%s|%s|%s|%s\n' % (nation, ip, port, locate, anony, protocol, speed, time))
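Note that this loop assumes every data row carries at least ten <td> cells; an ad or separator row would raise an IndexError. A minimal defensive sketch (the column-count threshold is an assumption about this page's layout):

for tr in trs[1:]:
    tds = tr.find_all('td')
    # skip rows that lack the expected number of columns (assumed threshold)
    if len(tds) < 10:
        continue
    ip = tds[1].text.strip()
    port = tds[2].text.strip()
    print('%s:%s' % (ip, port))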
Fetching proxy IP information from www.66ip.cn
When scraping this site, the Chinese text in the response comes back garbled. The fix, shown below, is to check which encoding the page declares and then set it explicitly on the response.
url="http://www.66ip.cn/"
#通过循环拼接字符url页面来进行分页
for page in range(1, 4):
url = targeturl + str(page)
print(url)
info = requests.get(url)
#获取页面编码
print(requests.utils.get_encodings_from_content(info.text))
#指定页面编码,解决中文乱码问题
info.encoding = 'gb2312'
html_doc = info.text
soup = BeautifulSoup(html_doc,"lxml")
#这里页面有多个table所以我们获取所有table来索引我们想要的位置
trs = soup.findAll('table')[2].find_all('tr')
for ip in trs:
print(ip)
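Hard-coding gb2312 works for this site, but the encoding can also be picked up automatically. A sketch using requests' own helpers (apparent_encoding guesses from the raw bytes, so it may differ from the declared encoding):

info = requests.get(url)
# prefer an encoding declared inside the HTML, fall back to a content-based guess
declared = requests.utils.get_encodings_from_content(info.text)
info.encoding = declared[0] if declared else info.apparent_encoding
html_doc = info.text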
Putting the scraped IPs into a list, then using multiple threads to pick out the usable proxies
if __name__ == "__main__":
    lists1 = getProxyList_2()
    lists2 = getProxyList()
    lists = lists1 + lists2
    lists_ip = []    # shared pool of working proxies, filled by the threads
    all_thread = []
    # start one validation thread per proxy address
    for i in range(0, len(lists)):
        t = threading.Thread(target=ProxyLists, args=(lists[i],))
        all_thread.append(t)
        t.start()
    for t in all_thread:
        t.join()
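Starting one thread per proxy can spawn hundreds of threads at once. A bounded alternative using the standard library's thread pool (a sketch reusing the same ProxyLists function; the pool size of 20 is an arbitrary choice):

from concurrent.futures import ThreadPoolExecutor

# cap concurrency instead of starting one thread per proxy
with ThreadPoolExecutor(max_workers=20) as pool:
    pool.map(ProxyLists, lists)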
Complete code
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import threading
import requests

lock = threading.Lock()
def getProxyList(targeturl="http://www.xicidaili.com/nn/"):
    requestHeader = {
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36"}
    ip_list = []
    for page in range(1, 2):
        url = targeturl + str(page)
        print(url)
        info = requests.get(url, headers=requestHeader)
        html_doc = info.text
        soup = BeautifulSoup(html_doc, "lxml")
        # soup = BeautifulSoup(html_doc, "html.parser")  # fallback if lxml is unavailable
        trs = soup.find('table', id='ip_list').find_all('tr')
        for tr in trs[1:]:  # trs[0] is the header row
            tds = tr.find_all('td')
            if tds[0].find('img') is None:
                nation = 'unknown'
                locate = 'unknown'
            else:
                nation = tds[0].find('img')['alt'].strip()
                locate = tds[3].text.strip()
            ip = tds[1].text.strip()    # IP address
            port = tds[2].text.strip()  # port
            anony = tds[5].text.strip()     # anonymity level
            protocol = tds[6].text.strip()  # protocol
            speed = tds[7].find('div')['title'].strip()  # speed
            time = tds[9].text.strip()  # last-verified time
            ip_list.append("%s:%s" % (ip, port))
    # the header row is already skipped above, so return the full list
    return ip_list
def getProxyList_2(targeturl="http://www.66ip.cn/"):
    requestHeader = {
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36"}
    ip_list = []
    for page in range(1, 2):
        url = targeturl + str(page)
        print(url)
        info = requests.get(url, headers=requestHeader)
        # set the encoding explicitly to fix the garbled Chinese text
        info.encoding = 'gb2312'
        html_doc = info.text
        soup = BeautifulSoup(html_doc, "lxml")
        # the page holds several tables; index the one with the proxy list
        trs = soup.findAll('table')[2].find_all('tr')
        for tr in trs[1:]:  # skip the header row
            tds = tr.find_all('td')
            ip = tds[0].text.strip()    # IP address
            port = tds[1].text.strip()  # port
            ip_list.append("%s:%s" % (ip, port))
    return ip_list
def ProxyList(lists):
    '''
    Validate a whole list of proxies in place, removing every address
    that fails a test request. (Kept for reference; the threaded entry
    point below uses ProxyLists instead.)
    '''
    myurl = 'http://www.baidu.com/'
    for i in lists[:]:  # iterate over a copy so removal is safe
        ip, port = i.split(":")
        try:
            # timeout so a dead proxy cannot hang the check forever
            requests.get(myurl, proxies={"http": "http://%s:%s" % (ip, port)}, timeout=5)
            print("Success:" + ip + ":" + port)
        except requests.RequestException:
            print(ip, port + ' unavailable')
            lists.remove(i)
    print(lists)
    return lists
def ProxyLists(lists):
    '''
    Validate a single "ip:port" proxy and, if it responds,
    append it to the shared lists_ip pool.
    '''
    myurl = 'http://www.baidu.com/'
    ip, port = lists.split(":")
    try:
        # timeout so a dead proxy cannot hang the thread forever
        requests.get(myurl, proxies={"http": "http://%s:%s" % (ip, port)}, timeout=5)
        print("Success:" + ip + ":" + port)
        # only the shared-list append needs the lock; holding it
        # around the request would serialize all of the threads
        with lock:
            lists_ip.append({ip: port})
    except requests.RequestException:
        print(ip, port + ' unavailable')
    print(lists_ip)
    return lists_ip
if __name__ == "__main__":
    lists1 = getProxyList_2()
    lists2 = getProxyList()
    lists = lists1 + lists2
    lists_ip = []    # shared pool of working proxies, filled by the threads
    all_thread = []
    # start one validation thread per proxy address
    for i in range(0, len(lists)):
        t = threading.Thread(target=ProxyLists, args=(lists[i],))
        all_thread.append(t)
        t.start()
    for t in all_thread:
        t.join()
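Once lists_ip has been filled, any entry can be plugged straight back into requests. A minimal usage sketch (httpbin.org/ip is an assumed test endpoint that simply echoes the caller's IP):

import random

# pick a random working proxy from the pool and route a request through it
proxy = random.choice(lists_ip)        # e.g. {'1.2.3.4': '8080'}
ip, port = next(iter(proxy.items()))
resp = requests.get('http://httpbin.org/ip',
                    proxies={'http': 'http://%s:%s' % (ip, port)},
                    timeout=5)
print(resp.text)  # should report the proxy's IP, not ours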
This article showed how to use Python to scrape proxy IP information from sites such as xicidaili.com and 66ip.cn, parse the pages with BeautifulSoup, validate the proxies with multiple threads, and build a pool of usable proxy IPs.