爬虫 Scrapy 入门
好久没有搞爬虫了,今天把最重要的一段代码贴上,留作以后学习参考。
# -*- coding: utf-8 -*-
import scrapy
from properties.items import PropertiesItem
from scrapy.loader import ItemLoader
from scrapy.http import Request
import urlparse
class BasicSpider(scrapy.Spider):
    """Spider that crawls meizitu.com listing pages and scrapes image URLs.

    ``parse`` handles a listing page: it follows the "next page" link and
    yields one request per item (picture) page.  ``parse_item`` loads each
    picture page's image URLs into a ``PropertiesItem``.
    """

    name = "manual"
    # Bug fix: this was ["www"], which made Scrapy's offsite middleware drop
    # every followed request — the registered domain must be listed instead.
    allowed_domains = ["meizitu.com"]
    start_urls = (
        'http://www.meizitu.com/a/list_1_2.html',
    )

    def parse(self, response):
        """Yield a request for the next listing page and for each item page.

        :param response: the listing-page response being parsed.
        """
        # Pagination: on this site's pager, li[18] holds the "next page" anchor.
        next_selector = response.xpath('//*[@id="wp_page_numbers"]/ul/li[18]/a/@href')
        for url in next_selector.extract():
            # response.urljoin resolves relative hrefs against the page URL
            # (replaces the Python 2-only urlparse.urljoin call).
            yield Request(response.urljoin(url))

        # Item links: one anchor per thumbnail on the listing page.
        item_selector = response.xpath('//*[@id="maincontent"]/div[1]/ul/li/div/div/a/@href')
        for url in item_selector.extract():
            yield Request(url, callback=self.parse_item)

    def parse_item(self, response):
        """Extract the picture URLs from an item page into a PropertiesItem.

        :param response: the picture-page response.
        :returns: a loaded ``PropertiesItem`` with ``img_url`` populated.
        """
        loader = ItemLoader(item=PropertiesItem(), response=response)
        loader.add_xpath('img_url', '//*[@id="picture"]/p/img/@src')
        return loader.load_item()