<span style="font-size:18px;">使用scrapy框架优化爬虫,这次爬取的是豆瓣电影Top250,可以爬取多页。</span>
<span style="font-size:18px;">主要用css和xpath来截取数据。</span>
<span style="font-size:18px;">
</span>
<span style="font-size:18px;">import scrapy
from scrapy.selector import Selector
try:
from scrapy.spiders import Spider
except:
from scrapy.spiders import BaseSpider as Spider
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
from scrapy.http import Request
from fztest.items import *
class fztestSpider(CrawlSpider):
name = "fztest"
allowed_admins = ["fztest.org"]
allowed_domains = ["movie.douban.com"]
start_urls = [
# "http://www.dmoz.org/Computers/Programming/Languages/Python/",
"https://movie.douban.com/top250"
]
rules = [
Rule(LinkExtractor(allow=("https://movie.douban.com/top250\?start=\d+.*")), #需要对?号进行转义。
follow=True,
callback='parse_item')]
def parse_item(self,response): #奇葩问题,如果我用def parse则只能取一页的,parse_item则可以爬多页。
items = []
postTitle = Selector(response).css('ol.grid_view div.item')
for selec in range(len(postTitle)):
item = fztestItem()
item['matitle'] = postTitle[selec].css("div.info div.hd a span:nth-child(1)").xpath('text()').extract()
item['mdlink'] = postTitle[selec].css("div.info a").xpath('@href').extract()
item['mepic'] = postTitle[selec].css("div.pic a img").xpath('@src').extract()
item['mbret'] = postTitle[selec].css("div.bd p:nth-child(1)").xpath("text()".strip()).extract()
item['mbret'] = item['mbret'][0].strip()
item['mcdesc'] = postTitle[selec].css("div.bd p.quote span.inq").xpath('text()').extract()
items.append(item)
return items</span>