import multiprocessing
import time
from queue import Empty

import requests
from lxml import etree
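
# Scrape free HTTP proxies from xicidaili.com in one producer process, then
# validate them in parallel with a multiprocessing pool. The scraper assumes
# the site still serves its list in a table with id="ip_list"; the site may
# no longer be reachable, so treat the URL as a placeholder target.
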
def get_all_proxy(queue):
    """Scrape page 1 of xicidaili's high-anonymity list and enqueue each proxy."""
    url = 'http://www.xicidaili.com/nn/1'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    }
    response = requests.get(url, headers=headers)
    html_ele = etree.HTML(response.text)
    # The IP sits in the 2nd cell of each row, the port in the 3rd.
    ip_eles = html_ele.xpath('//table[@id="ip_list"]/tr/td[2]/text()')
    port_eles = html_ele.xpath('//table[@id="ip_list"]/tr/td[3]/text()')
    for ip, port in zip(ip_eles, port_eles):
        queue.put('http://' + ip + ':' + port)

def check_one_proxy(proxy):
    """Return the proxy if it fetches the test page within 5 seconds, else None."""
    url = 'http://www.baidu.com/s?wd=ip'
    proxy_dict = {
        'http': proxy,
    }
    try:
        response = requests.get(url, proxies=proxy_dict, timeout=5)
        if response.status_code == 200:
            print('Proxy works: ' + proxy)
            return proxy
        print('Proxy returned status ' + str(response.status_code) + ': ' + proxy)
        return None
    except requests.RequestException:
        # Timeouts and connection errors both mean the proxy is unusable.
        print('Proxy failed: ' + proxy)
        return None

if __name__ == '__main__':
    start_time = time.time()
    q = multiprocessing.Queue()
    # One producer process scrapes the list while a pool of workers validates it.
    p = multiprocessing.Process(target=get_all_proxy, args=(q,))
    p.start()
    pool = multiprocessing.Pool(50)
    result_list = []
    while True:
        try:
            proxy_str = q.get(timeout=5)
        except Empty:
            # No new proxy for 5 seconds: assume the producer is finished.
            break
        proxy_res = pool.apply_async(check_one_proxy, (proxy_str,))
        result_list.append(proxy_res)
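    # The timeout-based drain above is a heuristic: if scraping stalls for
    # more than 5 seconds, the loop exits early. A common alternative (a
    # sketch, not in the original) is to have the producer enqueue a sentinel
    # such as None when done and break when the consumer sees it.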
    valid_proxy_list = []
    for proxy_res in result_list:
        result = proxy_res.get()
        if result is not None:
            valid_proxy_list.append(result)
    print('All usable proxies:')
    print(valid_proxy_list)
    pool.close()
    pool.join()
    p.join()
    end_time = time.time()
    print('--' * 30)
    print('Elapsed: ' + str(end_time - start_time) + ' s')
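
# A minimal alternative sketch, not part of the original script: proxy
# validation is network-bound, so the standard library's thread pool usually
# matches a 50-process pool with less overhead. The helper name
# check_proxies_threaded is hypothetical and nothing above calls it.
from concurrent.futures import ThreadPoolExecutor

def check_proxies_threaded(proxies, max_workers=50):
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # executor.map preserves input order; drop the proxies that failed.
        return [p for p in executor.map(check_one_proxy, proxies) if p is not None]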