如何使用 IP 代理进行爬虫：抓取代理 IP 列表并保存到本地文件的示例

import urllib
import socket
import urllib2
import time
from bs4 import BeautifulSoup


url = 'http://www.xicidaili.com/nn/'
target="https://msdn.microsoft.com"
dirt={}
proxy = {'http': '223.15.151.149:8888'}
proxy_support = urllib2.ProxyHandler(proxy)
# opener = urllib2.build_opener(proxy_support,urllib2.HTTPHandler(debuglevel=1))
opener = urllib2.build_opener(proxy_support)
urllib2.install_opener(opener)

# 添加头信息,模仿浏览器抓取网页,对付返回403禁止访问的问题
# i_headers = {'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
i_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.48'}


f = open("proxy.txt","w")

for i in range(1,1504):
	new_url=url+str(i)
	print new_url
	time.sleep(3)
	req = urllib2.Request(new_url, headers=i_headers)
	html = urllib2.urlopen(req).read()
	soup=BeautifulSoup(html,"html.parser")
	#print soup.body
	ips = soup.find_all('tr')
	#print ips

	for x in range(1,len(ips)):
	    ip = ips[x]
	    tds = ip.find_all("td")
	    #print tds[1].text,tds[2].text
	    dirt[tds[1].text]=tds[2].text
	    f.write(tds[1].text+":"+tds[2].text+"\n")
print len(dirt)
socket.setdefaulttimeout(3)




评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值