import requests
from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import re
from multiprocessing.dummy import Pool
"""
爬取http://www.goubanjia.com/ ip代理网站
此网站的反爬机制:在显示ip的标签中伪造了display:none的误导信息,并使用了js来更改端口号
采取的破解策略为使用selenium无头浏览器,然后使用xpath解析过滤掉误导信息
"""
# Configure a headless Chrome browser: the site renders the real port via JS,
# so a plain requests.get() would not see the final table contents.
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
browser = webdriver.Chrome(chrome_options=chrome_options)
# Fetch the JS-rendered page. quit() runs in finally so a Chrome process is
# not leaked if the request or rendering raises.
url = 'http://www.goubanjia.com'
try:
    browser.get(url)
    page_text = browser.page_source
finally:
    browser.quit()
tree = etree.HTML(page_text)
pool = Pool(10)  # thread pool (multiprocessing.dummy) used for proxy validation below
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
}
ip_list = []     # every proxy scraped from the table
right_list = []  # proxies that passed validation (filled by test_ip / test_ip2)
# One <tr> per proxy; [1:] skips the table header row.
tree_list = tree.xpath('//*[@id="services"]/div/div[2]/div/div/div/table//tr')
for row in tree_list[1:]:
    # The ip cell mixes genuine text with decoy nodes styled "display: none";
    # keep only visible pieces plus the JS-corrected port in the last <span>.
    ip = "".join(row.xpath('./td[1]//*[@style!="display: none"]/text() | ./td[1]/text() | ./td[1]/span[last()]/text()'))
    level = "".join(row.xpath('./td[2]//text()'))
    # renamed local (was `type`) so the builtin type() is not shadowed;
    # the dict key stays "type" for compatibility with the validators.
    proxy_type = "".join(row.xpath('./td[3]//text()'))
    address = "".join(row.xpath('./td[4]/a//text()')).replace(" ", "")
    ip_list.append({"ip": ip, "level": level, "type": proxy_type, "address": address})
print(ip_list)
# Validate a proxy by searching "ip" on Baidu through it and comparing the
# IP Baidu reports with the proxy's own host address.
def test_ip(dic):
    """Check one scraped proxy dict ({"ip": "host:port", "type": ..., ...}).

    Appends ``dic`` to the module-level ``right_list`` when Baidu sees the
    request coming from the proxy host. Errors are printed and swallowed so
    one bad proxy does not abort the whole batch.
    """
    test_url = 'http://www.baidu.com/s?ie=UTF-8&wd=ip'
    try:
        # requests expects lowercase scheme keys and full proxy URLs
        # ("http://host:port"), not a bare "host:port".
        proxy_type = dic["type"].lower()
        proxy_url = dic["ip"] if "://" in dic["ip"] else proxy_type + "://" + dic["ip"]
        # timeout keeps a dead proxy from hanging a pool worker forever.
        response = requests.get(test_url, headers=headers,
                                proxies={proxy_type: proxy_url}, timeout=10)
        tree = etree.HTML(response.text)
        li = tree.xpath('//div[@id="1"]/div[1]/div[1]/div[2]/table//tr/td//text()')
        ip = "".join(li).replace(' ', '')
        # raw string: the original '[\d\.]+' triggers an invalid-escape
        # DeprecationWarning; guard against an empty match before indexing.
        matches = re.findall(r'[\d.]+', ip)
        if matches and matches[0] == dic["ip"].split(":")[0]:
            right_list.append(dic)
    except Exception as e:
        print(e)
# Validate a proxy against https://httpbin.org/get, which echoes the caller's
# origin IP as JSON -- simpler and more reliable than scraping Baidu's page.
def test_ip2(dic):
    """Check one scraped proxy dict against httpbin's origin echo.

    Appends ``dic`` to the module-level ``right_list`` when httpbin reports
    the proxy host as the request origin. Errors are printed and swallowed.
    """
    test_url = 'https://httpbin.org/get'
    try:
        # requests expects lowercase scheme keys and full proxy URLs
        # ("http://host:port"), not a bare "host:port".
        proxy_type = dic["type"].lower()
        proxy_url = dic["ip"] if "://" in dic["ip"] else proxy_type + "://" + dic["ip"]
        # timeout keeps a dead proxy from hanging a pool worker forever.
        response = requests.get(test_url, headers=headers,
                                proxies={proxy_type: proxy_url}, timeout=10)
        # "origin" may be "client, proxy" -- compare only the first address.
        if response.json()["origin"].split(",")[0].strip() == dic["ip"].split(":")[0]:
            right_list.append(dic)
    except Exception as e:
        print(e)
'''
这是一开始不使用线程池爬取的,很慢,加了线程池之后还不是很快
for dic in ip_list:
test_url = 'http://www.baidu.com/s?ie=UTF-8&wd=ip'
try:
response = requests.get(test_url, headers=headers, proxies={dic["type"]: dic["ip"]})
tree = etree.HTML(response.text)
li = tree.xpath('//div[@id="1"]/div[1]/div[1]/div[2]/table//tr/td//text()')
ip = "".join(li).replace(' ', '')
if re.findall('[\d\.]+', ip)[0] == dic["ip"].split(":")[0]:
right_list.append(dic)
except Exception as e:
print(e)
'''
# Validate every scraped proxy concurrently (map blocks until all finish),
# then shut the pool down so worker threads are not leaked at exit.
pool.map(test_ip2, ip_list)
pool.close()  # no further tasks will be submitted
pool.join()   # wait for the worker threads to exit
print(right_list)