Code differences between Spider, CrawlSpider, and RedisSpider

This article compares three ways to implement a Scrapy crawler: the Spider class, used to scrape a fixed set of pages; the CrawlSpider class, which crawls multiple pages by following link-extraction rules; and the RedisSpider class, which pulls its start URLs from Redis and is suited to distributed crawling.

Spider: 

import scrapy


class BookSpider(scrapy.Spider):
    name = 'book'
    allowed_domains = ['suning.com']           # domains the spider is allowed to crawl
    start_urls = ['https://book.suning.com/']  # initial requests are generated from this list

    def parse(self, response):
        # default callback for every response produced from start_urls
        pass
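A plain Spider only requests what is listed in start_urls unless parse explicitly yields new requests. As a minimal sketch of what a filled-in parse usually looks like (the original leaves it empty; the CSS selector below is a hypothetical placeholder):

    def parse(self, response):
        # hypothetical selector; adjust to the real page structure
        for href in response.css('a.book-title::attr(href)').getall():
            yield {'url': response.urljoin(href)}  # yield an extracted item
            # follow-up requests are yielded the same way, e.g.:
            # yield scrapy.Request(response.urljoin(href), callback=self.parse_detail)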

CrawlSpider:

# -*- coding: utf-8 -*-
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class KaoyanSpider(CrawlSpider):
    name = 'kaoyan'
    allowed_domains = ['kaoyan365.cn']
    start_urls = ['http://www.kaoyan365.cn/kaoyantiaoji/tiaojixinxi/158281.html']

    rules = (
        # Extract the URLs of list pages
        Rule(LinkExtractor(allow=r'position\.php\?&start=\d*?#a'), callback='parse_list', follow=True),
        # Extract the URLs of detail pages
        # Rule(LinkExtractor(allow=r'position_detail\.php\?id=\d*?&keywords=&tid=0&lid=0'), callback='parse_item')
    )

    def parse_list(self, response):
        pass
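Unlike a plain Spider, a CrawlSpider applies its rules to every response it downloads, so the rule callbacks only need to extract data; do not override parse, because CrawlSpider uses it internally to apply the rules. A minimal sketch of parse_list (the original leaves it empty; the XPath expressions are hypothetical placeholders):

    def parse_list(self, response):
        # hypothetical selectors; adjust to the actual list-page markup
        for row in response.xpath('//table//tr'):
            yield {
                'title': row.xpath('./td[1]/a/text()').get(),
                'link': response.urljoin(row.xpath('./td[1]/a/@href').get() or ''),
            }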

RedisSpider:

from scrapy_redis.spiders import RedisSpider


class BookSpider(RedisSpider):  # inherits from RedisSpider instead of scrapy.Spider
    name = 'dang'
    allowed_domains = ['dangdang.com']  # allowed_domains is still specified manually
    # start_urls = ['http://dangdang.com/']  # no start_urls
    redis_key = "dangdang"  # start URLs are popped from this Redis key instead

    def parse(self, response):
        pass
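Because a RedisSpider has no start_urls, crawling only begins once a URL is pushed onto the list stored at redis_key, and the project has to route scheduling and deduplication through scrapy-redis. A sketch of the usual settings.py entries and the seeding command, assuming a local Redis instance (the host, port, and seed URL below are example assumptions, not taken from the original):

# settings.py -- scrapy-redis scheduling (values are example assumptions)
SCHEDULER = "scrapy_redis.scheduler.Scheduler"               # schedule requests through Redis
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"   # shared request-fingerprint dedup
SCHEDULER_PERSIST = True                                     # keep the queue between runs
REDIS_URL = "redis://127.0.0.1:6379"                         # assumed local Redis instance

# Seed the shared queue from redis-cli; every worker running this spider consumes it:
#   lpush dangdang http://book.dangdang.com/

Every worker that runs "scrapy crawl dang" against the same Redis instance shares one request queue and one duplicate filter, which is what makes the crawl distributable.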

 
