from urllib import request
import random
import re
def get_html(url, timeout=30):
    """Fetch *url* and return its body decoded as text.

    A browser-like User-Agent is sent because the proxy-listing site
    rejects urllib's default UA.

    url: page to fetch.
    timeout: socket timeout in seconds (new parameter, default 30,
        so a dead host cannot hang the script forever).
    """
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36'
    }
    req = request.Request(url, headers=header)
    # `with` closes the response; the original leaked the socket.
    with request.urlopen(req, timeout=timeout) as response:
        return response.read().decode()
def get_proxy(html):
    """Extract 'ip:port' strings from the proxy-list page HTML.

    Bugs fixed:
    * the original appended to a *global* ``proxy_list`` it never
      defined (NameError when called standalone, and results leaked
      across calls) — now a local list is built and returned;
    * ``findall(...)[0]`` raised IndexError on header/malformed rows
      that lack an IP or a purely numeric <td> — such rows are skipped.
    """
    proxy_list = []
    rows = re.findall(r'<tr class=.*?>(.*?)</tr>', html, re.S)
    print(len(rows))
    for row in rows:
        ips = re.findall(r'\d+\.\d+\.\d+\.\d+', row, re.S)
        ports = re.findall(r'<td>(\d+)</td>', row, re.S)
        if ips and ports:  # skip rows that are not real proxy entries
            proxy_list.append('{}:{}'.format(ips[0], ports[0]))
    return proxy_list
def get_real_proxy(proxy_list, limit=8, timeout=10):
    """Probe proxies against httpbin.org and drop the dead ones.

    Only the first ``limit`` entries are probed (matching the original
    hard-coded 8); any remainder is kept untested.

    Bugs fixed:
    * the original did ``del proxy_list[i]`` while indexing
      ``range(8)``, shifting indices so later iterations probed the
      wrong entry and could raise IndexError;
    * ``range(8)`` crashed outright when fewer than 8 proxies were
      scraped — slicing handles any list length;
    * ``urlopen`` had no timeout, so one dead proxy hung the script.

    Returns a NEW list (the input is no longer mutated): the working
    proxies from the probed prefix plus the untested remainder.
    """
    url = 'http://httpbin.org/ip'  # echoes the caller's IP -> proves the proxy works
    good = []
    for i, perproxy in enumerate(proxy_list[:limit]):
        print(i)
        print(perproxy)
        proxy_suport = request.ProxyHandler({'http': perproxy})  # route http via this proxy
        opener = request.build_opener(proxy_suport)
        request.install_opener(opener)  # NOTE: installs globally for urlopen
        req = request.Request(url)
        try:
            html = request.urlopen(req, timeout=timeout).read().decode()
            print(html)
        except Exception as e:
            print(e)
            print("该代理不可用")
        else:
            good.append(perproxy)  # reached httpbin -> proxy is alive
    return good + proxy_list[limit:]
if __name__ == "__main__":
    # Scrape the free-proxy listing, print every candidate, then
    # report how many survive the liveness check.
    target = 'http://www.xicidaili.com/nn/'
    proxy_list = []  # module-level: get_proxy() appends into this list
    page = get_html(target)
    proxy_list = get_proxy(page)
    for candidate in proxy_list:
        print(candidate)
    real_proxy = get_real_proxy(proxy_list)
    print(len(real_proxy))