Scrapy framework + MongoDB: crawling a novel website

This article shows how to create and run a crawler project with the Scrapy framework, covering the whole process from installing MongoDB to writing the spider code, including how to scrape a novel's chapter list and the chapter contents themselves.


1. First install MongoDB, then start the database with `mongod --dbpath "D:\mongodb\data"`.
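
To confirm that the server is actually reachable, a minimal pymongo connection check can be run first (this snippet is not part of the project; it assumes the default local address 127.0.0.1:27017):

import pymongo

# Fail fast (after ~2 s) if mongod is not running
client = pymongo.MongoClient('127.0.0.1', 27017, serverSelectionTimeoutMS=2000)
print(client.server_info()['version'])   # prints the MongoDB server version on success

With the database running, check the Scrapy installation; invoking `scrapy` with no arguments prints the available commands: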


  scrapy <command> [options] [args]
Available commands:
  bench         Run quick benchmark test
  fetch         Fetch a URL using the Scrapy downloader
  genspider     Generate new spider using pre-defined templates
  runspider     Run a self-contained spider (without creating a project)
  settings      Get settings values
  shell         Interactive scraping console
  startproject  Create new project
  version       Print Scrapy version
  view          Open URL in browser, as seen by Scrapy
  [ more ]      More commands available when run from project directory

2. Create the crawler project with `scrapy startproject <project-name>`:

scrapy startproject novel

You can start your first spider with:
    cd novel
    scrapy genspider example example.com

Following the hint above creates an example spider, which you can run to check that the project was set up correctly:

scrapy crawl example --nolog   # run the test spider; results are printed to the console

scrapy crawl example --nolog -o a.json   # write the results to a local file a.json
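
The -o feed export is not limited to JSON; Scrapy infers the format from the file extension, so .csv, .xml and .jl (JSON lines) work as well, for example:

scrapy crawl example --nolog -o a.csv   # export the same results as CSV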

At this point, the project has been created.
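
For reference, startproject generates roughly the following layout (file names may differ slightly between Scrapy versions):

novel/
    scrapy.cfg            # deploy configuration
    novel/                # the project's Python module
        __init__.py
        items.py          # item definitions (see below)
        middlewares.py
        pipelines.py      # item pipelines (see below)
        settings.py       # project settings (see below)
        spiders/
            __init__.py
            buquge.py     # the spider written below (e.g. created with scrapy genspider)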

Open novel/spiders/buquge.py:

# -*- coding: utf-8 -*-
import scrapy
import requests
from bs4 import BeautifulSoup
import re
from ..items import SpidertestItem
from ..items import ChapterItem
import time
import os

class BuqugeSpider(scrapy.Spider):
    name = 'buquge'
    allowed_domains = ['biquge5.com']
    # The first URL to request
    start_urls = ['http://www.biquge5.com/xiaoshuodaquan']

    # The response of each request arrives here for parsing
    def parse(self, response):
        # Each entry in the novel list looks like <a href="...">title <em>author</em></a>
        text = re.findall('<a href="(.*?)">(.*?) <em>(.*?)</em>', response.text)
        for i in text:
            # Collect the index data (the item is built here, but only the
            # detail-page request is yielded; chapter items are emitted later)
            item = SpidertestItem()
            item['href'] = 'http://www.biquge5.com' + i[0]
            item['name'] = i[1]
            item['author'] = i[2]
            # Make sure the local output directory exists
            if not os.path.exists('../novel'):
                os.makedirs('../novel')
            # Return a Request object; callback names the method that will
            # handle the response returned for this URL
            yield scrapy.Request(url='http://www.biquge5.com' + i[0], callback=self.parse_content)
    # Parse a novel's detail page: title, introduction and chapter list
    def parse_content(self, response):
        # Use Scrapy's built-in selectors to pull out the page elements
        novel_name = response.xpath('//*[@id="info"]/h1/text()').extract_first()
        jieshao = response.xpath('//*[@id="intro"]/p/text()')
        text = re.findall('<ul class="_chapter">(.*?)</ul>', response.text, re.S)
        chapters = re.findall('<a href="(.*?)\r\n\t\t\t(.*?)">(.*?)</a>', text[0], re.S)
        for item in chapters:
            # Fill an item object with the chapter information
            chapter = ChapterItem()
            chapter['name'] = novel_name
            chapter['href'] = item[0] + item[1]
            chapter['chapterName'] = item[2]
            # Yield the item to pipelines.py, where it is saved
            yield chapter
        for item in chapters:
            yield scrapy.Request(url=item[0] + item[1], callback=self.parse_chapterContent)
    # Extract a chapter's text and save it to disk
    def parse_chapterContent(self, response):
        time.sleep(0.5)
        # Category (from the breadcrumb), novel title and chapter title
        sort = response.css('#wrapper > div.content_read > div > div.con_top > a:nth-child(4)::text').extract_first()
        novel_name = response.xpath('//*[@id="wrapper"]/div[5]/div[1]/a[3]/text()').extract_first()
        chapter_name = response.css('#wrapper h1::text').extract_first()
        # '/' is not allowed in a file name; keep only the part before it
        if '/' in chapter_name:
            chapter_name = re.split('/', chapter_name)[0]

        bs = BeautifulSoup(response.text, 'lxml')
        content = bs.select("#content")

        # Save the downloaded chapter locally: create the category/novel directory if needed
        if not os.path.exists('../novel/' + sort + '/' + novel_name):
            os.makedirs('../novel/' + sort + '/' + novel_name)

        # Append the chapter text to <chapter name>.txt
        with open('../novel/' + sort + '/' + novel_name + '/' + chapter_name + '.txt', 'a+', encoding='utf-8') as f:
            f.write(content[0].get_text())

        # Split the current URL into host / novel id / chapter id
        url_first = re.findall('http://(.*?).html', response.url)
        urls = re.split('/', url_first[0])

        # Follow the "下一章" (next chapter) link, but only when its href contains '_',
        # i.e. a continuation page of the current chapter
        url = re.findall('<a href="(.*?).html">下一章</a>', response.text)
        if '_' in url[0]:
            return scrapy.Request(url='http://' + urls[0] + '/' + urls[1] + '/' + url[0] + '.html',
                                  callback=self.parse_chapterContent)
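
A side note on throttling: the time.sleep(0.5) call above blocks Scrapy's event loop. The idiomatic way to slow a crawl down is the DOWNLOAD_DELAY setting; a sketch of what could go into settings.py (the values are only suggestions, not from the original project):

# settings.py (suggested values)
DOWNLOAD_DELAY = 0.5                   # wait 0.5 s between requests to the same site
CONCURRENT_REQUESTS_PER_DOMAIN = 4     # limit parallel requests per domain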

When an item has been scraped by a spider, it is sent to the Item Pipeline, where several components process it one after another. Each Item Pipeline component is simply a Python class that implements a single simple method: it receives an item, runs some logic on it, and also decides whether the item should continue down the pipeline or be dropped.
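
For example, a component can discard an item by raising DropItem. The pipeline below is only an illustration (it is not part of this project) that drops chapters with an empty chapterName:

from scrapy.exceptions import DropItem

class FilterEmptyChapterPipeline(object):
    def process_item(self, item, spider):
        # Once dropped here, no later pipeline component will see the item
        if not item.get('chapterName'):
            raise DropItem('missing chapterName in %r' % item)
        return item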

/novel/pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
from scrapy.conf import settings
# Receives the item objects defined in items.py and saves them to MongoDB
class SpidertestPipeline(object):
    # open_spider() is called when the spider starts
    def open_spider(self, spider):
        host = settings['MONGODB_HOST']
        port = settings['MONGODB_PORT']
        db_name = settings['MONGODB_DBNAME']
        client = pymongo.MongoClient(host=host, port=port)
        db = client[db_name]
        self.post = db[settings['MONGODB_DOCNAME']]
    # Process each item object
    def process_item(self, item, spider):
        data = {
            'url': item['href'],
            'name': item['name'],
            'chapterName': item['chapterName']
        }
        # Insert the record into the collection
        self.post.insert_one(data)
        return item
    # def close_spider(self, spider): called when the spider closes

class ChapterPipeline(object):

    # Same as above, but with the connection details hard-coded
    def open_spider(self, spider):
        client = pymongo.MongoClient(host='127.0.0.1', port=27017)
        db = client['scrapy']
        self.post = db['chapter']

    def process_item(self, item, spider):
        data2 = {
            'url': item['href'],
            'name': item['name'],
            'chapterName': item['chapterName']
        }
        self.post.insert_one(data2)
        return item
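
One caveat: `from scrapy.conf import settings` is deprecated and has been removed in recent Scrapy releases. A sketch of the currently recommended pattern, reading the same setting names through the pipeline's from_crawler() hook (the class name MongoSettingsPipeline is just a placeholder, not part of this project):

import pymongo

class MongoSettingsPipeline(object):
    def __init__(self, host, port, db_name, doc_name):
        self.host, self.port = host, port
        self.db_name, self.doc_name = db_name, doc_name

    @classmethod
    def from_crawler(cls, crawler):
        # Settings come from the crawler instead of the global scrapy.conf module
        s = crawler.settings
        return cls(s.get('MONGODB_HOST'), s.getint('MONGODB_PORT'),
                   s.get('MONGODB_DBNAME'), s.get('MONGODB_DOCNAME'))

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(host=self.host, port=self.port)
        self.post = self.client[self.db_name][self.doc_name]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        self.post.insert_one({
            'url': item['href'],
            'name': item['name'],
            'chapterName': item['chapterName'],
        })
        return item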

An Item is the container that holds the scraped data. It is used much like a Python dict, but adds protection against undefined fields caused by typos: if a field is not declared on the item, the data scraped by the spider cannot be assigned to it.

/novel/items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy

# Inherit from scrapy.Item
class SpidertestItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    href = scrapy.Field()
    name = scrapy.Field()
    author = scrapy.Field()

class ChapterItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    href = scrapy.Field()
    name = scrapy.Field()
    chapterName = scrapy.Field()
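
A quick illustration of the field protection mentioned above (the misspelled key is deliberate):

chapter = ChapterItem()
chapter['chapterName'] = '第一章'   # declared field: OK
chapter['chaptername'] = 'oops'     # not declared on ChapterItem -> raises KeyError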

settings.py is loaded automatically at startup and holds the project configuration.

/novel/settings.py

# Configuration is loaded automatically at startup

ITEM_PIPELINES = {
   'novel.pipelines.SpidertestPipeline': 300,
   # 'novel.pipelines.ChapterPipeline': 300,
}
# MongoDB connection settings
# host address (local)
MONGODB_HOST='127.0.0.1'
# port
MONGODB_PORT=27017
# database name
MONGODB_DBNAME='scrapy'
# collection name
MONGODB_DOCNAME='ty_wiwj'
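
With the pipeline registered and MongoDB configured, the spider is started from the project directory with `scrapy crawl buquge`. Afterwards the stored records can be inspected with pymongo (a quick sanity check assuming the settings above; count_documents requires pymongo 3.7+):

import pymongo

client = pymongo.MongoClient('127.0.0.1', 27017)
collection = client['scrapy']['ty_wiwj']       # MONGODB_DBNAME / MONGODB_DOCNAME from settings.py
print(collection.count_documents({}))          # number of chapter records stored so far
for doc in collection.find().limit(3):
    print(doc['name'], doc['chapterName'], doc['url'])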

