Deep Crawling with CrawlSpider

- CrawlSpider
    - A technique built on Scrapy for crawling an entire site's data.
    - CrawlSpider is a subclass of Spider
        - Link extractor: LinkExtractor
        - Rule parser: Rule
    - Usage workflow:
        - Create a new project
        - cd into the project
        - Generate a spider file: scrapy genspider -t crawl spiderName www.xxx.com (the generated skeleton is sketched below)
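For reference, the crawl template produces roughly the following skeleton (a sketch based on Scrapy's standard -t crawl template; the exact output varies slightly between Scrapy versions, and the allow pattern r'Items/' is just the template's placeholder):

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class SpidernameSpider(CrawlSpider):
    name = 'spiderName'
    allowed_domains = ['www.xxx.com']
    start_urls = ['http://www.xxx.com/']

    # Each Rule pairs a LinkExtractor with an optional callback and follow flag
    rules = (
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = {}
        return item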


# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from sunCrawlPro.items import SuncrawlproItem, Detail_item


class SunSpider(CrawlSpider):
    name = 'sun'
    # allowed_domains = ['www.xxx.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=']
    # Instantiate a link extractor object.
    # Purpose: extract the links that match the given rule (allow='regex').
    link = LinkExtractor(allow=r'type=4&page=\d+')  # extract pagination links
    # Extract links to the news detail pages
    link_detail = LinkExtractor(allow=r"question/\d+/\d+\.shtml")
    rules = (
        # Pass link as the first argument of the Rule constructor.
        # Purpose: send requests for the links the extractor found, then parse
        # each response with the given callback.
        Rule(link, callback='parse_item', follow=False),
        # follow=True would apply the link extractor recursively to the pages
        # reached through the extracted links.
        Rule(link_detail, callback='parse_detail'),
    )

    def parse_item(self, response):
        # tbody must not appear in the XPath expression: browsers insert it,
        # but the raw HTML that Scrapy receives does not contain it.
        tr_list = response.xpath('//*[@id="morelist"]/div/table[2]//tr/td/table//tr')
        for tr in tr_list:
            title = tr.xpath('./td[2]/a[2]/text()').extract_first()
            num = tr.xpath('./td[1]/text()').extract_first()
            item = SuncrawlproItem()
            item['title'] = title
            item['num'] = num

            yield item

    def parse_detail(self, response):
        content = response.xpath('/html/body/div[9]/table[2]//tr[1]/td/text()').extract_first()
        num = response.xpath('/html/body/div[9]/table[1]//tr/td[2]/span[2]/text()').extract_first()
        num = num.split(':')[-1]
        item = Detail_item()
        item['content'] = content
        item['num'] = num
        yield item

The website has since changed, so the code above is revised as follows:

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from crawlspider.items import CrawlspiderItem, DetailItem


class ClspiderSpider(CrawlSpider):
    name = 'clspider'
    # allowed_domains = ['www.xx.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=']

    # Instantiate a link extractor object.
    # Purpose: extract the links that match the given rule (allow='regex').
    link = LinkExtractor(allow=r'id=1&page=\d+')  # extract pagination links

    detail_link = LinkExtractor(allow=r'index\?id=\d+')  # extract detail-page links

    rules = (
        # Pass link as the first argument of the Rule constructor.
        # Purpose: send requests for the links the extractor found, then parse
        # each response with the given callback.
        Rule(link, callback='parse_item', follow=True),
        # follow=True: apply the link extractor recursively to the pages
        # reached through the extracted links.
        Rule(detail_link, callback='parse_detail'),
    )

    def parse_item(self, response):
        li_list = response.xpath('/html//div[2]/div[3]/ul[2]/li')

        for li in li_list:
            title = li.xpath('./span[3]/a/text()').extract_first()
            num = li.xpath('./span[1]/text()').extract_first()
            item = CrawlspiderItem()
            item['title'] = title
            item['num'] = num
            # print(item)

            yield item

    def parse_detail(self, response):
        # Assumes the <pre> node exists; extract_first() returns None otherwise.
        content = response.xpath(
            '/html//div[3]/div[2]/div[2]/div[2]/pre/text()'
        ).extract_first().strip().replace("\r\n", "").replace(" ", "")
        num = response.xpath('/html//div[3]/div[2]/div[2]/div[1]/span[4]/text()').extract_first()
        num = num.split(':')[-1]
        item = DetailItem()
        item['num'] = num
        item['content'] = content
        yield item


items.py (for the original sunCrawlPro project):

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class SuncrawlproItem(scrapy.Item):
    title = scrapy.Field()
    num = scrapy.Field()

class Detail_item(scrapy.Item):
    content = scrapy.Field()
    num = scrapy.Field()
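
The revised spider imports CrawlspiderItem and DetailItem from crawlspider.items, so that project's items.py would define the analogous classes (a sketch inferred from the import and the fields assigned in the spider; it is not shown in the original):

import scrapy


class CrawlspiderItem(scrapy.Item):
    title = scrapy.Field()
    num = scrapy.Field()


class DetailItem(scrapy.Item):
    content = scrapy.Field()
    num = scrapy.Field()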

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


class SuncrawlproPipeline(object):
    def process_item(self, item, spider):
        # Both item types pass through this one pipeline; branch on the class
        # name to tell detail items apart from list-page items.
        if item.__class__.__name__ == 'Detail_item':
            content = item['content']
            num = item['num']
            print(item)
        else:
            title = item['title']
            num = item['num']
            print(item)
        return item
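
The pipeline only runs if it is registered in settings.py; a minimal sketch (300 is just the conventional priority value):

ITEM_PIPELINES = {
    'sunCrawlPro.pipelines.SuncrawlproPipeline': 300,
}

Note that in the revised crawlspider project the class-name check would compare against 'DetailItem' rather than 'Detail_item', since that is the name the revised items.py defines.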
