环境:Python2.7,requests,bs4(使用 lxml 解析器),re
获取数据的网址:西刺代理(http://www.xicidaili.com/nn)
得到的结果,后续可自行构建代理池或者保存文件:
代码段:
#coding=utf8
import requests
from bs4 import BeautifulSoup
import re
import os.path
# User-Agent string placed in the request headers below.
user_agent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_5)'
# Header mapping handed to session.get() inside getListProxies.
headers = {}
headers['User-Agent'] = user_agent
def getListProxies():
session = requests.session()
page = session.get("http://www.xicidaili.com/nn", headers=headers)
soup = BeautifulSoup(page.text, 'lxml')
proxyList = []
taglist = soup.find_all('tr', attrs={'class': re.compile("(odd)|()")})
for trtag in taglist:
tdlist = trtag.find_all('td')
proxy = {'http': tdlist[1].string + ':' + tdlist[2].string,