import requests
from lxml import etree
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
import re
from multiprocessing.dummy import Pool
"""
爬取http://www.goubanjia.com/ ip代理网站
此网站的反爬机制:在显示ip的标签中伪造了display:none的误导信息,并使用了js来更改端口号
采取的破解策略为使用selenium无头浏览器,然后使用xpath解析过滤掉误导信息
"""
# Configure a headless Chrome browser: the site renders the real port via JS,
# so a plain requests.get() would not see the final table contents.
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')
browser = webdriver.Chrome(chrome_options=chrome_options)
# Fetch the JS-rendered page. quit() runs in finally so a Chrome process is
# not leaked if the request or rendering raises.
url = 'http://www.goubanjia.com'
try:
    browser.get(url)
    page_text = browser.page_source
finally:
    browser.quit()
tree = etree.HTML(page_text)
pool = Pool(10)  # thread pool (multiprocessing.dummy) used for proxy validation below
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.181 Safari/537.36',
}
ip_list = []     # every proxy scraped from the table
right_list = []  # proxies that passed validation (filled by test_ip / test_ip2)
# One <tr> per proxy; [1:] skips the table header row.
tree_list = tree.xpath('//*[@id="services"]/div/div[2]/div/div/div/table//tr')
for row in tree_list[1:]:
    # The ip cell mixes genuine text with decoy nodes styled "display: none";
    # keep only visible pieces plus the JS-corrected port in the last <span>.
    ip = "".join(row.xpath('./td[1]//*[@style!="display: none"]/text() | ./td[1]/text() | ./td[1]/span[last()]/text()'))
    level = "".join(row.xpath('./td[2]//text()'))
    # renamed local (was `type`) so the builtin type() is not shadowed;
    # the dict key stays "type" for compatibility with the validators.
    proxy_type = "".join(row.xpath('./td[3]//text()'))
    address = "".join(row.xpath('./td[4]/a//text()')).replace(" ", "")
    ip_list.append({"ip": ip, "level": level, "type": proxy_type, "address": address})
print(ip_list)
# Validate a proxy by searching "ip" on Baidu through it and comparing the
# IP Baidu reports with the proxy's own host address.
def test_ip(dic):
    """Check one scraped proxy dict ({"ip": "host:port", "type": ..., ...}).

    Appends ``dic`` to the module-level ``right_list`` when Baidu sees the
    request coming from the proxy host. Errors are printed and swallowed so
    one bad proxy does not abort the whole batch.
    """
    test_url = 'http://www.baidu.com/s?ie=UTF-8&wd=ip'
    try:
        # requests expects lowercase scheme keys and full proxy URLs
        # ("http://host:port"), not a bare "host:port".
        proxy_type = dic["type"].lower()
        proxy_url = dic["ip"] if "://" in dic["ip"] else proxy_type + "://" + dic["ip"]
        # timeout keeps a dead proxy from hanging a pool worker forever.
        response = requests.get(test_url, headers=headers,
                                proxies={proxy_type: proxy_url}, timeout=10)
        tree = etree.HTML(response.text)
        li = tree.xpath('//div[@id="1"]/div[1]/div[1]/div[2]/table//tr/td//text()')
        ip = "".join(li).replace(' ', '')
        # raw string: the original '[\d\.]+' triggers an invalid-escape
        # DeprecationWarning; guard against an empty match before indexing.
        matches = re.findall(r'[\d.]+', ip)
        if matches and matches[0] == dic["ip"].split(":")[0]:
            right_list.append(dic)
    except Exception as e:
        print(e)
# Validate a proxy against https://httpbin.org/get, which echoes the caller's
# origin IP as JSON -- simpler and more reliable than scraping Baidu's page.
def test_ip2(dic):
    """Check one scraped proxy dict against httpbin's origin echo.

    Appends ``dic`` to the module-level ``right_list`` when httpbin reports
    the proxy host as the request origin. Errors are printed and swallowed.
    """
    test_url = 'https://httpbin.org/get'
    try:
        # requests expects lowercase scheme keys and full proxy URLs
        # ("http://host:port"), not a bare "host:port".
        proxy_type = dic["type"].lower()
        proxy_url = dic["ip"] if "://" in dic["ip"] else proxy_type + "://" + dic["ip"]
        # timeout keeps a dead proxy from hanging a pool worker forever.
        response = requests.get(test_url, headers=headers,
                                proxies={proxy_type: proxy_url}, timeout=10)
        # "origin" may be "client, proxy" -- compare only the first address.
        if response.json()["origin"].split(",")[0].strip() == dic["ip"].split(":")[0]:
            right_list.append(dic)
    except Exception as e:
        print(e)
'''
这是一开始不使用线程池爬取的,很慢,加了线程池之后还不是很快
for dic in ip_list:
test_url = 'http://www.baidu.com/s?ie=UTF-8&wd=ip'
try:
response = requests.get(test_url, headers=headers, proxies={dic["type"]: dic["ip"]})
tree = etree.HTML(response.text)
li = tree.xpath('//div[@id="1"]/div[1]/div[1]/div[2]/table//tr/td//text()')
ip = "".join(li).replace(' ', '')
if re.findall('[\d\.]+', ip)[0] == dic["ip"].split(":")[0]:
right_list.append(dic)
except Exception as e:
print(e)
'''
# Validate every scraped proxy concurrently (map blocks until all finish),
# then shut the pool down so worker threads are not leaked at exit.
pool.map(test_ip2, ip_list)
pool.close()  # no further tasks will be submitted
pool.join()   # wait for the worker threads to exit
print(right_list)