Installing the modules
Install BeautifulSoup:
pip install beautifulsoup4
Install the lxml HTML parser:
pip install lxml
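To confirm both packages are importable, a quick optional check from the command line (bs4 exposes a __version__ attribute):
python -c "import bs4, lxml; print(bs4.__version__)"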
Fetching proxy IP information from www.xicidaili.com
import requests
from bs4 import BeautifulSoup

requestHeader = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36"}
url = 'http://www.xicidaili.com/nn/'
# Fetch the HTML page
info = requests.get(url, headers=requestHeader)
html_doc = info.text
# Parse the HTML page
soup = BeautifulSoup(html_doc, "lxml")
# Locate the table by its id and grab all of its rows
trs = soup.find('table', id='ip_list').find_all('tr')

# Loop over the table rows, pull out each field, and print it
for tr in trs[1:]:  # trs[0] is the header row
    tds = tr.find_all('td')
    if tds[0].find('img') is None:
        nation = 'unknown'
        locate = 'unknown'
    else:
        nation = tds[0].find('img')['alt'].strip()
        locate = tds[3].text.strip()
    ip = tds[1].text.strip()    # IP address
    port = tds[2].text.strip()  # port
    anony = tds[5].text.strip()     # anonymity level
    protocol = tds[6].text.strip()  # protocol
    speed = tds[7].find('div')['title'].strip()  # speed
    time = tds[9].text.strip()  # last-verified time
    print('%s|%s|%s|%s|%s|%s|%s|%s\n' % (nation, ip, port, locate, anony, protocol, speed, time))
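Note that this loop assumes every data row carries at least ten <td> cells; an ad or separator row would raise an IndexError. A minimal defensive sketch (the column-count threshold is an assumption about this page's layout):

for tr in trs[1:]:
    tds = tr.find_all('td')
    # skip rows that lack the expected number of columns (assumed threshold)
    if len(tds) < 10:
        continue
    ip = tds[1].text.strip()
    port = tds[2].text.strip()
    print('%s:%s' % (ip, port))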
Fetching proxy IP information from www.66ip.cn
When scraping this site, the Chinese text in the response comes back garbled. The fix, shown below, is to check which encoding the page declares and then set it explicitly on the response.
url="http://www.66ip.cn/"
#通过循环拼接字符url页面来进行分页
for page in range(1, 4):
url = targeturl + str(page)
print(url)
info = requests.get(url)
#获取页面编码
print(requests.utils.get_encodings_from_content(info.text))
#指定页面编码,解决中文乱码问题
info.encoding = 'gb2312'
html_doc = info.text
soup = BeautifulSoup(html_doc,"lxml")
#这里页面有多个table所以我们获取所有table来索引我们想要的位置
trs = soup.findAll('table')[2].find_all('tr')
for ip in trs:
print(ip)
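Hard-coding gb2312 works for this site, but the encoding can also be picked up automatically. A sketch using requests' own helpers (apparent_encoding guesses from the raw bytes, so it may differ from the declared encoding):

info = requests.get(url)
# prefer an encoding declared inside the HTML, fall back to a content-based guess
declared = requests.utils.get_encodings_from_content(info.text)
info.encoding = declared[0] if declared else info.apparent_encoding
html_doc = info.text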
Putting the scraped IPs into a list, then using multiple threads to pick out the usable proxies
if __name__ == "__main__":
    lists1 = getProxyList_2()
    lists2 = getProxyList()
    lists = lists1 + lists2
    lists_ip = []    # shared pool of working proxies, filled by the threads
    all_thread = []
    # start one validation thread per proxy address
    for i in range(0, len(lists)):
        t = threading.Thread(target=ProxyLists, args=(lists[i],))
        all_thread.append(t)
        t.start()
    for t in all_thread:
        t.join()
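Starting one thread per proxy can spawn hundreds of threads at once. A bounded alternative using the standard library's thread pool (a sketch reusing the same ProxyLists function; the pool size of 20 is an arbitrary choice):

from concurrent.futures import ThreadPoolExecutor

# cap concurrency instead of starting one thread per proxy
with ThreadPoolExecutor(max_workers=20) as pool:
    pool.map(ProxyLists, lists)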
Complete code
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
import threading
import requests

lock = threading.Lock()
def getProxyList(targeturl="http://www.xicidaili.com/nn/"):
    requestHeader = {
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36"}
    ip_list = []
    for page in range(1, 2):
        url = targeturl + str(page)
        print(url)
        info = requests.get(url, headers=requestHeader)
        html_doc = info.text
        soup = BeautifulSoup(html_doc, "lxml")
        # soup = BeautifulSoup(html_doc, "html.parser")  # fallback if lxml is unavailable
        trs = soup.find('table', id='ip_list').find_all('tr')
        for tr in trs[1:]:  # trs[0] is the header row
            tds = tr.find_all('td')
            if tds[0].find('img') is None:
                nation = 'unknown'
                locate = 'unknown'
            else:
                nation = tds[0].find('img')['alt'].strip()
                locate = tds[3].text.strip()
            ip = tds[1].text.strip()    # IP address
            port = tds[2].text.strip()  # port
            anony = tds[5].text.strip()     # anonymity level
            protocol = tds[6].text.strip()  # protocol
            speed = tds[7].find('div')['title'].strip()  # speed
            time = tds[9].text.strip()  # last-verified time
            ip_list.append("%s:%s" % (ip, port))
    # the header row is already skipped above, so return the full list
    return ip_list
def getProxyList_2(targeturl="http://www.66ip.cn/"):
    requestHeader = {
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.80 Safari/537.36"}
    ip_list = []
    for page in range(1, 2):
        url = targeturl + str(page)
        print(url)
        info = requests.get(url, headers=requestHeader)
        # set the encoding explicitly to fix the garbled Chinese text
        info.encoding = 'gb2312'
        html_doc = info.text
        soup = BeautifulSoup(html_doc, "lxml")
        # the page holds several tables; index the one with the proxy list
        trs = soup.findAll('table')[2].find_all('tr')
        for tr in trs[1:]:  # skip the header row
            tds = tr.find_all('td')
            ip = tds[0].text.strip()    # IP address
            port = tds[1].text.strip()  # port
            ip_list.append("%s:%s" % (ip, port))
    return ip_list
def ProxyList(lists):
    '''
    Validate a whole list of proxies in place, removing every address
    that fails a test request. (Kept for reference; the threaded entry
    point below uses ProxyLists instead.)
    '''
    myurl = 'http://www.baidu.com/'
    for i in lists[:]:  # iterate over a copy so removal is safe
        ip, port = i.split(":")
        try:
            # timeout so a dead proxy cannot hang the check forever
            requests.get(myurl, proxies={"http": "http://%s:%s" % (ip, port)}, timeout=5)
            print("Success:" + ip + ":" + port)
        except requests.RequestException:
            print(ip, port + ' unavailable')
            lists.remove(i)
    print(lists)
    return lists
def ProxyLists(lists):
    '''
    Validate a single "ip:port" proxy and, if it responds,
    append it to the shared lists_ip pool.
    '''
    myurl = 'http://www.baidu.com/'
    ip, port = lists.split(":")
    try:
        # timeout so a dead proxy cannot hang the thread forever
        requests.get(myurl, proxies={"http": "http://%s:%s" % (ip, port)}, timeout=5)
        print("Success:" + ip + ":" + port)
        # only the shared-list append needs the lock; holding it
        # around the request would serialize all of the threads
        with lock:
            lists_ip.append({ip: port})
    except requests.RequestException:
        print(ip, port + ' unavailable')
    print(lists_ip)
    return lists_ip
if __name__ == "__main__":
    lists1 = getProxyList_2()
    lists2 = getProxyList()
    lists = lists1 + lists2
    lists_ip = []    # shared pool of working proxies, filled by the threads
    all_thread = []
    # start one validation thread per proxy address
    for i in range(0, len(lists)):
        t = threading.Thread(target=ProxyLists, args=(lists[i],))
        all_thread.append(t)
        t.start()
    for t in all_thread:
        t.join()
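Once lists_ip has been filled, any entry can be plugged straight back into requests. A minimal usage sketch (httpbin.org/ip is an assumed test endpoint that simply echoes the caller's IP):

import random

# pick a random working proxy from the pool and route a request through it
proxy = random.choice(lists_ip)        # e.g. {'1.2.3.4': '8080'}
ip, port = next(iter(proxy.items()))
resp = requests.get('http://httpbin.org/ip',
                    proxies={'http': 'http://%s:%s' % (ip, port)},
                    timeout=5)
print(resp.text)  # should report the proxy's IP, not ours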
This article showed how to use Python to scrape proxy IP information from sites such as xicidaili.com and 66ip.cn, parse the pages with BeautifulSoup, validate the proxies with multiple threads, and build a pool of usable proxy IPs.