#!/usr/bin/python
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
# Crawl the proxy listing pages and save every HTTP/HTTPS entry to
# xicidaili.txt, one "PROTOCOL=ip:port" record per line.

# Base URL of the listing; the page number is appended to it.
url = 'http://www.xicidaili.com/nn/'
# Browser-like User-Agent sent with every request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'}

# The context manager guarantees the output file is flushed and closed even
# if a request or a parse step raises part-way through the crawl.
with open('xicidaili.txt', 'w') as of:
    # Original author's note: a single IP is limited at roughly 60 pages.
    for page in range(1, 60):
        # A timeout keeps one stalled connection from hanging the whole crawl.
        r = requests.get(url + str(page), headers=headers, timeout=10)
        r.encoding = 'utf-8'
        html = r.text
        soup = BeautifulSoup(html, 'html.parser')
        table = soup.find('table', id='ip_list')
        if table is None:
            # Without the listing table there is nothing to parse on this
            # page; stop instead of failing with AttributeError.
            print('[break] no ip_list table on page', page)
            break
        # The first row is skipped (treated as the table header).
        for tr in table.find_all('tr')[1:]:
            tds = tr.find_all('td')
            if len(tds) < 6:
                # Cells 1, 2 and 5 are read below; skip rows that lack them.
                continue
            ip = tds[1].text.strip()
            port = tds[2].text.strip()
            protocol = tds[5].text.strip()
            if protocol in ('HTTP', 'HTTPS'):
                of.write('%s=%s:%s\n' % (protocol, ip, port))
                print('%s=%s:%s' % (protocol, ip, port))
#!/usr/bin/python
# -*- coding: utf-8 -*-
import threading
import requests
from bs4 import BeautifulSoup
# Shared module-level state for the proxy checker; the worker function reads
# and updates these names directly.

# Browser-like User-Agent sent with every probe request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36',
}

# Candidate proxies are read from inFile; working ones are written to outFile.
inFile = open('xicidaili.txt', 'r')
outFile = open('available.txt', 'w')

# Guards access to the shared file handles.
lock = threading.Lock()

# Progress counters: lines consumed so far and proxies confirmed working.
count = availableCount = 0

# Count the input lines once for progress output, then rewind so the worker
# starts reading from the first line again.
totalLine = sum(1 for _ in inFile)
inFile.seek(0)
def test():
    """Check candidate proxies from ``inFile`` until the file is exhausted.

    Each input line has the form ``PROTOCOL=ip:port``. The proxy part is
    used for one request to the ip138 echo page; when the decoded response
    contains the marker text, the proxy is appended to ``outFile`` and
    ``availableCount`` is incremented. Progress is printed using the
    module-level ``count`` and ``totalLine``.

    The function takes no arguments and returns ``None``; it works entirely
    on module-level state (``inFile``, ``outFile``, ``lock``, ``headers``,
    ``count``, ``availableCount``, ``totalLine``).
    """
    global count
    global availableCount
    while True:
        # Other threads wait while one line is read: the counter bump and
        # the read on the shared handle happen together under the lock.
        with lock:
            count += 1
            current = count  # snapshot so later prints report this line
            raw = inFile.readline()
        if not raw:
            # readline() returns '' only at end of file.
            print('[break] count', current, '/', totalLine)
            break
        line = raw.strip()
        if not line:
            # A blank line is not end of input; keep going.
            continue
        protocol, sep, proxy = line.partition('=')
        if not sep or not proxy:
            print('[skip] malformed line', current, '/', totalLine)
            continue
        try:
            r = requests.get('http://2018.ip138.com/ic.asp', proxies={'http': proxy}, headers=headers, timeout=0.5)
            # The response body is decoded as GBK.
            html = str(r.content, 'gbk')
        except Exception:
            # Any request or decode failure just means this proxy is
            # unusable; Exception (not a bare except) lets SystemExit and
            # KeyboardInterrupt propagate.
            print('[except] connect failed', current, '/', totalLine)
            continue
        if html.find('来自:') == -1:
            print('[else] connect failed', current, '/', totalLine)
            continue
        soup = BeautifulSoup(html, 'html.parser')
        center_tag = soup.find('center')
        center = center_tag.text.strip() if center_tag is not None else ''
        # Other threads wait while one line is written; the shared success
        # counter is updated under the same lock.
        with lock:
            availableCount += 1
            found = availableCount
            outFile.write(proxy + '\r\n')
        # str() is required here: concatenating the int counter directly to
        # the bracket strings raises TypeError.
        print(protocol, proxy, center, '[' + str(found) + ']')
# Create the worker threads (currently a single one), start them, and wait
# for all of them to finish.
all_thread = [threading.Thread(target=test) for _ in range(1)]
for worker in all_thread:
    worker.start()
# Both files are closed only after every thread has finished.
for worker in all_thread:
    worker.join()
inFile.close()
outFile.close()
# 抓取可用高匿名代理,或者CC (Scrape usable high-anonymity proxies, or CC)