- CrawlSpider
  - A technique, built on scrapy, for crawling an entire site's data.
  - CrawlSpider is a subclass of Spider.
    - Link extractor: LinkExtractor
    - Rule parser: Rule (a standalone sketch of LinkExtractor follows this list)
  - Usage flow:
    - Create a new project
    - cd into the project directory
    - Create a crawl spider file: scrapy genspider -t crawl spiderName www.xxx.com
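Before the full example, here is a minimal sketch of what a LinkExtractor does on its own: given a response, it returns every link whose URL matches the allow regex. Inside a CrawlSpider the extractor is normally wrapped in a Rule rather than called by hand. The LinkDemoSpider below is hypothetical and only illustrative; the regex is the page-number pattern used in the example that follows.

import scrapy
from scrapy.linkextractors import LinkExtractor


class LinkDemoSpider(scrapy.Spider):
    # Hypothetical demo spider, only meant to show extract_links() in isolation.
    name = 'link_demo'
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=']

    def parse(self, response):
        page_link_extractor = LinkExtractor(allow=r'type=4&page=\d+')
        for link in page_link_extractor.extract_links(response):
            # extract_links() returns scrapy.link.Link objects; .url is the absolute URL.
            self.logger.info(link.url)

The full CrawlSpider example, as generated by scrapy genspider -t crawl and then filled in, is shown next.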
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from sunCrawlPro.items import SuncrawlproItem, Detail_item


class SunSpider(CrawlSpider):
    name = 'sun'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=']

    # Instantiate a link extractor object.
    # Purpose: extract the links that match the given rule (allow='regex').
    link = LinkExtractor(allow=r'type=4&page=\d+')  # pagination links
    # Link extractor for the news detail pages.
    link_detail = LinkExtractor(allow=r"question/\d+/\d+\.shtml")

    rules = (
        # link is passed as the first argument of the Rule constructor.
        # Purpose: send requests for the links the extractor finds and parse
        # the responses with the specified callback.
        Rule(link, callback='parse_item', follow=False),
        # follow=True would keep applying the link extractor to the pages
        # reached through the links it extracts.
        Rule(link_detail, callback='parse_detail'),
    )

    def parse_item(self, response):
        # tbody tags must not appear in the XPath expression.
        tr_list = response.xpath('//*[@id="morelist"]/div/table[2]//tr/td/table//tr')
        for tr in tr_list:
            title = tr.xpath('./td[2]/a[2]/text()').extract_first()
            num = tr.xpath('./td[1]/text()').extract_first()
            item = SuncrawlproItem()
            item['title'] = title
            item['num'] = num
            yield item

    def parse_detail(self, response):
        content = response.xpath('/html/body/div[9]/table[2]//tr[1]/td/text()').extract_first()
        num = response.xpath('/html/body/div[9]/table[1]//tr/td[2]/span[2]/text()').extract_first()
        num = num.split(':')[-1]
        item = Detail_item()
        item['content'] = content
        item['num'] = num
        yield item
Because the website has since changed, the code above is corrected as follows:
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from crawlspider.items import CrawlspiderItem, DetailItem


class ClspiderSpider(CrawlSpider):
    name = 'clspider'
    # allowed_domains = ['www.xx.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=']

    # Instantiate link extractor objects.
    # Purpose: extract the links that match the given rule (allow='regex').
    link = LinkExtractor(allow=r'id=1&page=\d+')  # pagination links
    detail_link = LinkExtractor(allow=r'index\?id=\d+')  # detail-page links

    rules = (
        # link is passed as the first argument of the Rule constructor.
        # Purpose: send requests for the links the extractor finds and parse
        # the responses with the specified callback.
        Rule(link, callback='parse_item', follow=True),
        # follow=True: keep applying the link extractor to the pages reached
        # through the links it extracts.
        Rule(detail_link, callback='parse_detail'),
    )

    def parse_item(self, response):
        li_list = response.xpath('/html//div[2]/div[3]/ul[2]/li')
        for li in li_list:
            title = li.xpath('./span[3]/a/text()').extract_first()
            num = li.xpath('./span[1]/text()').extract_first()
            item = CrawlspiderItem()
            item['title'] = title
            item['num'] = num
            # print(item)
            yield item

    def parse_detail(self, response):
        content = response.xpath('/html//div[3]/div[2]/div[2]/div[2]/pre/text()').extract_first()
        content = content.strip().replace("\r\n", "").replace(" ", "")
        num = response.xpath('/html//div[3]/div[2]/div[2]/div[1]/span[4]/text()').extract_first()
        num = num.split(':')[-1]
        item = DetailItem()
        item['num'] = num
        item['content'] = content
        yield item
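Note that the corrected spider imports CrawlspiderItem and DetailItem from crawlspider.items, while the items.py and pipelines.py listed below belong to the first project (sunCrawlPro). A sketch of the matching item classes for the corrected project, with field names taken from the spider above, might look like this:

import scrapy


class CrawlspiderItem(scrapy.Item):
    # one record from the list page: title plus its number
    title = scrapy.Field()
    num = scrapy.Field()


class DetailItem(scrapy.Item):
    # one record from the detail page: full content plus its number
    content = scrapy.Field()
    num = scrapy.Field()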
items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy


class SuncrawlproItem(scrapy.Item):
    title = scrapy.Field()
    num = scrapy.Field()


class Detail_item(scrapy.Item):
    content = scrapy.Field()
    num = scrapy.Field()
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
class SuncrawlproPipeline(object):
    def process_item(self, item, spider):
        # Both item types pass through the same pipeline, so they are told
        # apart by class name before being handled (printed here).
        if item.__class__.__name__ == 'Detail_item':
            content = item['content']
            num = item['num']
            print(item)
        else:
            title = item['title']
            num = item['num']
            print(item)
        return item
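As the comment in the pipeline notes, process_item only runs once the pipeline is registered in settings.py. A minimal sketch of that setting for the sunCrawlPro project is shown below; the priority value 300 is simply the default from the generated Scrapy template, and the module path is assumed from the project name used in the imports above.

# settings.py (sunCrawlPro project) -- enable the pipeline so process_item runs.
# The key is the import path of the pipeline class; the number is its priority
# (lower numbers run first).
ITEM_PIPELINES = {
    'sunCrawlPro.pipelines.SuncrawlproPipeline': 300,
}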