环境配置好了,现在就可以抓一些代理服务器并验证试试。
废话不多说,上代码。
#-*- coding: utf-8 -*-
'''
/*********************************************************************************
*Copyright(C),2003-2013,KK Studio
*FileName: ProxytxtSpider
*Author: KK
*Version: 1.0
*Date: 20130810
*Description:
*Function List:
1.scrapy get proxy
*History:
1.20130816: //check proxytxt format
**********************************************************************************/
'''
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.selector import HtmlXPathSelector
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from Proxy.items import ProxyItem
import re
class ProxycrawlerSpider(CrawlSpider):
    """Scrape proxy-server entries (address, port, protocol, location)
    from a fixed cnhonkerarmy.com forum thread.

    NOTE(review): overriding parse() on a CrawlSpider normally breaks its
    Rule-based link following; this spider defines no rules, so that is
    harmless here, but a plain BaseSpider would express the intent better.
    """
    name = 'txtproxy'
    allowed_domains = ['www.cnhonkerarmy.com']
    start_urls = [
        'http://www.cnhonkerarmy.com/forum-viewthread-tid-176892-highlight-HTTP.html',
    ]

    def parse(self, response):
        """Extract every proxy entry from the thread page.

        Returns a list of ProxyItem with 'address', 'protocol',
        'location' and 'port' fields filled in.
        """
        hxs = HtmlXPathSelector(response)
        # All four fields live in the same text nodes; select once and run
        # each extraction regex over the cached selector (the original
        # re-evaluated the XPath four times).
        node = hxs.select('//div[@class="t_fsz"]/table/tr/td/text()')
        addresses = node.re(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
        protocols = node.re(r'@(.*);')
        locations = node.re(r';(.*)')
        ports = node.re(r':(.*)@')
        # zip stops at the shortest list, so a row whose regexes matched
        # unevenly no longer raises IndexError as the index-based loop did.
        items = []
        for address, protocol, location, port in zip(addresses, protocols,
                                                     locations, ports):
            item = ProxyItem()
            item['address'] = address
            item['protocol'] = protocol
            item['location'] = location
            item['port'] = port
            items.append(item)
        return items