spider定义如何从站点爬取
class MySpider(scrapy.Spider):
name = 'myspider'
def start_requests(self):
return [scrapy.FormRequest("http://www.example.com/login",
formdata={'user': 'john', 'pass': 'secret'},
callback=self.logged_in)]
def logged_in(self, response):
# here you would extract links to follow and return Requests for
# each of them, with another callback
pass
name #定义spider的名字,并搜索到它,必须是唯一的
allowed_domains #允许的域名
start_urls #定义爬取的URL地址,无特别指明,从这个序列地址开始
custom_setting # 优先于项目的默认设置
- crawler # from_crawler 类中的方法
setting #配置
logger #日志
from_crawler(crawler, *args, **kwargs) #创建spider
start_requests() #访问start_url中的地址,并返回Response给回调函数(callback) ,默认执行
parse(response) # 默认的回调函数,也可定义其他的回调函数 ,对返回的Response进行处理,提取数据等
log(message[,level,component]) # logger中的日志
closed(reason) #当关闭spider时调用
example :
import scrapy
class MySpider(scrapy.Spider):
name = 'example.com'
allowed_domains = ['example.com']
start_urls = [
'http://www.example.com/1.html',
'http://www.example.com/2.html',
'http://www.example.com/3.html',
]
def parse(self, response):
self.logger.info('A response from %s just arrived!', response.url)
import scrapy
class MySpider(scrapy.Spider):
name = 'example.com'
allowed_domains = ['example.com']
start_urls = [
'http://www.example.com/1.html',
'http://www.example.com/2.html',
'http://www.example.com/3.html',
]
def parse(self, response):
for h3 in response.xpath('//h3').extract():
yield {"title": h3}
for url in response.xpath('//a/@href').extract():
yield scrapy.Request(url, callback=self.parse)
import scrapy
from myproject.items import MyItem
class MySpider(scrapy.Spider):
name = 'example.com'
allowed_domains = ['example.com']
def start_requests(self):
yield scrapy.Request('http://www.example.com/1.html', self.parse)
yield scrapy.Request('http://www.example.com/2.html', self.parse)
yield scrapy.Request('http://www.example.com/3.html', self.parse)
def parse(self, response):
for h3 in response.xpath('//h3').extract():
yield MyItem(title=h3)
for url in response.xpath('//a/@href').extract():
yield scrapy.Request(url, callback=self.parse)