Proxy Scraping

This post describes a Python approach to scraping proxy IPs from the Xici proxy site (xicidaili) and verifying that they actually work. It parses the page source to extract each proxy's IP address, port, and protocol, then runs connection tests concurrently with multiple threads to filter out the usable high-anonymity proxies. The working proxies are saved to a local file, providing a reliable pool of network channels for later scraping tasks.


#!/usr/bin/python 
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup

url = 'http://www.xicidaili.com/nn/'
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
of = open('xicidaili.txt', 'w')

for page in range(1, 60):  # roughly 60 pages; a single IP is rate-limited beyond that
    r = requests.get(url + str(page), headers = headers)
    r.encoding = 'utf-8'
    html = r.text

    soup = BeautifulSoup(html, 'html.parser')
    trs = soup.find('table', id='ip_list').find_all('tr')
    for tr in trs[1:]:
        tds = tr.find_all('td')
        ip = tds[1].text.strip()
        port = tds[2].text.strip()
        protocol = tds[5].text.strip()
        if protocol == 'HTTP' or protocol == 'HTTPS':
            of.write('%s=%s:%s\n' % (protocol, ip, port))
            print('%s=%s:%s' % (protocol, ip, port))

of.close()  # close the file so all buffered lines are flushed to disk
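Each saved line has the form `PROTOCOL=ip:port`, e.g. `HTTP=1.2.3.4:8080`. As a hedged sketch (the helper name below is mine, not from the original scripts), such a line can be parsed back into the `proxies` mapping that `requests` accepts:

```python
# Hypothetical helper (not in the original scripts): turn one saved line,
# e.g. "HTTP=1.2.3.4:8080", into a proxies dict usable with requests.get().
def parse_proxy_line(line):
    protocol, proxy = line.strip().split('=')
    scheme = protocol.lower()  # "http" or "https"
    return {scheme: scheme + '://' + proxy}

print(parse_proxy_line('HTTP=1.2.3.4:8080'))
# → {'http': 'http://1.2.3.4:8080'}
```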
The second script reads xicidaili.txt, tests each proxy concurrently, and writes the working ones to available.txt:

#!/usr/bin/python 
# -*- coding: utf-8 -*-
import threading
import requests
from bs4 import BeautifulSoup

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}
inFile = open('xicidaili.txt', 'r')
outFile = open('available.txt', 'w')

lock = threading.Lock()
count = 0
availableCount = 0
totalLine = len(inFile.readlines())
inFile.seek(0, 0)

def test():
    global count
    global availableCount
    while True:
        lock.acquire()
        count += 1
        line = inFile.readline().strip()  # other threads wait while this thread reads a line
        lock.release()
        if len(line) == 0: 
            print('[break] count', count, '/', totalLine)
            break

        protocol, proxy = line.split('=')

        try:
            # ip138 echoes the visitor's IP; if the page loads through the proxy, the proxy works
            r = requests.get('http://2018.ip138.com/ic.asp', proxies = {'http': proxy}, headers = headers, timeout = 0.5)
            html = r.content.decode('gbk')
        except Exception:
            print('[except] connect failed', count, '/', totalLine)
        else:
            if html.find('来自:') != -1:
                soup = BeautifulSoup(html, 'html.parser')
                center = soup.find('center').text.strip()
                lock.acquire()
                availableCount += 1  # shared counter, so update it under the lock
                print(protocol, proxy, center, '[%d]' % availableCount)
                outFile.write(proxy + '\r\n')  # other threads wait while this thread writes
                lock.release()
            else:
                print('[else] connect failed', count, '/', totalLine)

all_thread = []
for i in range(10):  # spawn 10 worker threads so the tests actually run concurrently
    t = threading.Thread(target=test)
    all_thread.append(t)
    t.start()
    
for t in all_thread:  # close both files only after every thread has finished
    t.join()

inFile.close()
outFile.close()
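Once available.txt exists, later scraping code can draw from it. A minimal sketch (the helper name is mine, and the actual network request is left commented out to avoid a live call):

```python
import random

# Hypothetical helper: collect the non-empty "ip:port" lines the tester wrote.
def load_proxies(lines):
    return [line.strip() for line in lines if line.strip()]

# In practice the lines would come from open('available.txt');
# a literal list stands in here for illustration.
proxies = load_proxies(['1.2.3.4:8080', '', '5.6.7.8:3128\r\n'])
proxy = random.choice(proxies)       # rotate proxies by picking one at random
proxy_dict = {'http': 'http://' + proxy}
# requests.get('http://example.com/', proxies=proxy_dict, timeout=5)
```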

This scrapes usable high-anonymity proxies (or proxies for CC use).
