Example: scraping a proxy IP list
Key points
- beautifulsoup4
- urllib & urllib2 & httplib2
- HTTP requests
- file I/O
#encoding=utf-8
# from fib import fibonacci
# This statement does not import the whole fib module into the current namespace;
# it only binds the single name fibonacci from fib into the global symbol table
# of the module executing the import.
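# A minimal sketch of the difference (fib is a hypothetical module here):
#   import fib                  # names stay under the module: fib.fibonacci(10)
#   from fib import fibonacci   # only fibonacci is bound: fibonacci(10);
#                               # the name fib itself is NOT defined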
from bs4 import BeautifulSoup
import urllib2
import urllib
# Steps to install beautifulsoup4 from source
# curl http://www.crummy.com/software/BeautifulSoup/bs4/download/4.1/beautifulsoup4-4.1.2.tar.gz > beautifulsoup4-4.1.2.tar.gz
# tar zxvf beautifulsoup4-4.1.2.tar.gz
# cd beautifulsoup4-4.1.2
# python setup.py install
# (or simply: pip install beautifulsoup4)
# Differences between urllib2 and urllib (http://blog.youkuaiyun.com/dolphin_h/article/details/45296353)
# urllib2 accepts a Request instance, which lets you set the request headers;
# urllib only accepts a bare URL, so with urllib you cannot spoof things like the User-Agent string.
# url = 'http://www.someserver.com/cgi-bin/register.cgi'
# user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)' # goes into the request headers below
# values = {'name' : 'who','password':'123456'}
# headers = { 'User-Agent' : user_agent }
# data = urllib.urlencode(values) #{'wd':'D_in'} => wd=D_in
# req = urllib2.Request(url, data, headers)
# response = urllib2.urlopen(req)
# the_page = response.read()
# url = r'http://www.renren.com/ajaxLogin'
#
# # create a cookie container cj (requires: import cookielib)
# cj = cookielib.CookieJar()
# opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
# # encode the data to be POSTed; note 'pass' is a Python keyword, so call the variable 'password'
# data = urllib.urlencode({"email": email, "password": password})
# r = opener.open(url, data)
# print cj
# urllib provides urlencode for building GET query strings, while urllib2 does not;
# this is why urllib is so often used together with urllib2.
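# A minimal GET sketch combining the two (example.com is a placeholder host):
#   params = urllib.urlencode({'wd': 'D_in', 'page': 1})  # => 'wd=D_in&page=1'
#   resp = urllib2.urlopen('http://www.example.com/s?' + params)
#   print resp.getcode()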
# If you only do HTTP work, take a look at httplib2, which is friendlier than the
# other modules. (Note: the example below actually uses the stdlib httplib.)
# import httplib
# def sendhttp():
# data = urllib.urlencode({'@number': 12524, '@type': 'issue', '@action': 'show'})
# headers = {"Content-type": "application/x-www-form-urlencoded",
# "Accept": "text/plain"}
# conn = httplib.HTTPConnection('bugs.python.org')
# conn.request('POST', '/', data, headers)
# httpres = conn.getresponse()
# print httpres.status
# print httpres.reason
# print httpres.read()
#
# if __name__ == '__main__':
# sendhttp()
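# For comparison, roughly the same request with httplib2 (a hedged sketch; httplib2
# is a third-party package, installed e.g. via pip install httplib2):
#   import httplib2
#   h = httplib2.Http()
#   resp, content = h.request('http://bugs.python.org/', 'POST',
#                             body=data, headers=headers)
#   print resp.status
#   print content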
of = open('proxy2.txt', 'w')
for page in range(1, 2):  # only page 1 for now; widen the range to fetch more pages
    print "start page", page
    # Masquerade as a browser, otherwise the site may reject the request
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    req = urllib2.Request(url='http://www.xici.net.co/nn/' + str(page), headers=headers)
    resp = urllib2.urlopen(req)
# req = urllib.urlopen('http://www.xici.net.co/nn')
# Accept:text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8
# User-Agent:Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/43.0.2357.124 Safari/537.36
    html_doc = resp.read()
    # If Chinese text in the source looks garbled, try: print html_doc.decode('utf-8')
    print 'response code:', resp.getcode()
    print 'response header:', resp.info()
    print 'resp content:', html_doc
# html_doc = urllib2.urlopen('http://www.xici.net.co/nn/' + str(page)).read()
# print html_doc
    # Sample response (excerpt); the column headers below are, in order:
    # country, IP address, port, location, anonymity, type, speed, connect time, verify time
# <table id="ip_list">
# <tr>
# <th></th>
# <th>国家</th>
# <th>IP地址</th>
# <th>端口</th>
# <th>位置</th>
# <th>是否匿名</th>
# <th>类型</th>
# <th>速度</th>
# <th>连接时间</th>
# <th>验证时间</th>
# </tr>
# ....
# </table>
    # Specify the parser explicitly; bs4 falls back to a guess (with a warning) otherwise
    soup = BeautifulSoup(html_doc, 'html.parser')
    # the table with id 'ip_list'; collect all of its <tr> rows
    trs = soup.find('table', id='ip_list').find_all('tr')
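    # Equivalent with a CSS selector (bs4 also offers soup.select):
    #   trs = soup.select('table#ip_list tr')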
print "trs"
for tr in trs[1:]:
# <tr class="">
# <td></td>
# <td><img alt="Cn" src="http://fs.xicidaili.com/images/flag/cn.png"/></td>
# <td>114.105.216.240</td>
# <td>6675</td>
# <td>
# 安徽亳州
# </td>
# <td>高匿</td>
# <td>socks4/5</td>
# <td>
# <div class="bar" title="1.413秒">
# <div class="bar_inner fast" style="width:86%">
# </div>
# </div>
# </td>
# <td>
# <div class="bar" title="0.412秒">
# <div class="bar_inner fast" style="width:92%">
# </div>
# </div>
# </td>
# <td>13-01-09 15:11</td>
# </tr>
print "----------------------------------------\n", tr
tds = tr.find_all('td')
ip = tds[2].text.strip() # 第2个位ip
port = tds[3].text.strip()
protocol = tds[6].text.strip()
print "=============:\n"
print tds[2].text.strip(), ":", tds[3].text.strip(), " " ,protocol
print "============="
if protocol == 'HTTP' or protocol == 'HTTPS':
of.write('%s=%s:%s\n' % (protocol, ip, port) )
print '%s=%s:%s' % (protocol, ip, port)
of.close()
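
# A hedged sketch of using one of the scraped proxies afterwards; the file written
# above holds one 'PROTOCOL=ip:port' entry per line:
#   line = open('proxy2.txt').readline().strip()   # e.g. 'HTTP=1.2.3.4:8080'
#   protocol, address = line.split('=', 1)
#   opener = urllib2.build_opener(urllib2.ProxyHandler({protocol.lower(): address}))
#   print opener.open('http://www.baidu.com').getcode()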