scrapy爬取《坏蛋是怎样练成的4》-CSDN博客

本文介绍了使用Scrapy爬虫框架抓取网站内容的过程，并详细展示了如何解析网页、提取所需信息，以及将数据保存到本地文件和MySQL数据库的方法。

摘要生成于 C知道，由 DeepSeek-R1 满血版支持，前往体验 >

scrapy具体介绍就不用说了，自己百度一下。或者参考以下文档

https://blog.csdn.net/u011054333/article/details/70165401

直接在cmd里运行

scrapy startproject huaidan

scrapy genspider huaidan huaidan4.com

然后贴代码放到spiders文件夹里

 1 # -*- coding: utf-8 -*-
 2 import scrapy
 3 from scrapy.http import Request
 4 from urllib import parse
 5 import re
 6 
 7 class huaidan(scrapy.Spider):
 8     name = "huaidan"
 9     allowed_domains = ["www.huaidan4.com"]
10     start_urls = ["http://www.huaidan4.com/di-yi-fen-lei.html",
11                   "http://www.huaidan4.com/di-er-juan.html",
12                   "http://www.huaidan4.com"]
13 
14 
15     #提取下一页文章url交给scrpy进行下载
16     def parse(self, response):
17         #获取文章url
18         all_article=response.css('.container ul li a::attr(href)').extract()
19         all_url=[]
20         for article_url in all_article:
21             if article_url in all_url:
22                 pass
23             else:
24                 all_url.append(article_url)
25                 yield Request(url=article_url,encoding='utf-8',callback=self.parse_detail)
26 
27 
28 
29 
30     #提取文章的具体字段
31     def parse_detail(self,response):
32         #获取文章标题
33         article_title = response.xpath('//*[@id="content"]/div[1]/div[1]/h2/text()').extract_first()
34 
35         #获取创建时间
36         create_time = response.xpath('//*[@id="content"]/div[1]/div[1]/span/text()[2]').extract_first().strip()
37 
38         #获取文章正文
39         article_text = response.css('.post_entry,p::text').extract_first()
40         #处理正文标点符号和无用的信息
41         article_text = re.sub('</?\w+[^>]*>','',article_text)
42         article_text = article_text.replace("\', \'","")
43         article_text = article_text.replace("\\u3000","").strip()
44         article_text = article_text.replace("\\xa0\\xa0\\xa0\\xa0","")
45         article_text = article_text.replace("(新书上传，求收藏，推荐!!!!!!!!!!!!!!!!!!!!)","")
46         article_text = article_text.replace("\\r\\n", "\n")
47         article_text = article_text.replace("免费小说", "")
48         article_text = article_text.replace("www.huaidan4.com", "")
49         article_text = article_text.replace("neirong_2();", "")
50         article_text = article_text.replace("dibutuijian();", "")
51         article_text = article_text.replace("◎欢迎参与讨论，请在这里发表您的看法、交流您的观点。", "")
52         article_text = article_text.replace("《坏蛋是怎样炼成的4》是继曹三少坏蛋是怎样炼成的3的又一作品，作者是曹三少，如果你喜欢坏蛋是怎样炼成的4，请收藏本站以便下次阅读。","")
53         article_text = re.sub('/?\s+', '', article_text)
54 
55         #保存文件
56         self.save_article(article_title,create_time,str(article_text))
57 
58     #保存文件的方法
59     def save_article(self,article_title,create_time,article_text):
60         biaoti = re.sub('\W+','-',article_title)
61         with open(biaoti+'.txt','w',encoding='utf-8') as file:
62             neirong = (article_title+'\n'+create_time+'\n'+article_text)
63             file.write(neirong)
64             file.close()

以上内容初步完成了把文章保存在本地

---------------------------------------------------------------------------------------------------------------------------------------------------------------

下面内容完成把文章保存到mysql数据库

items.py负责存放爬取节点数据

import scrapy


class HuaidanItem(scrapy.Item):
    """Fields of one scraped chapter as consumed by the MySQL pipelines."""

    # Name of the table the row is inserted into (used as the table name in
    # the pipelines' INSERT statement).
    catalogues=scrapy.Field()
    # Chapter id; the pipelines convert it with int() and use it for the
    # "already stored?" lookup.
    id=scrapy.Field()
    # Chapter title.
    article_title = scrapy.Field()
    # Chapter body text.
    article_text = scrapy.Field()
    # Creation time of the chapter.
    create_time = scrapy.Field()

pipelines.py负责处理items里的内容

# -*- coding: utf-8 -*-

import logging

import pymysql
from twisted.enterprise import adbapi

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#from scrapy.pipelines.images import ImagesPipeline
#from scrapy.pipelines.files import FilesPipeline


class HuaidanPipeline:
    """Pass-through pipeline: hands every item on unchanged."""

    def process_item(self, item, spider):
        """Return ``item`` as received; ``spider`` is not used."""
        unchanged = item
        return unchanged

# Insert items into the MySQL database directly (synchronously)
class MysqlPiplines(object):
    """Store items in MySQL over one blocking pymysql connection.

    An item is inserted only if its id is not yet present in any of the
    tables listed in ``TABLES``.
    """

    # Tables that are searched for an existing id; an item's ``catalogues``
    # value must name one of them.
    TABLES = ("diyijuan", "dierjuan", "disanjuan", "other")

    _log = logging.getLogger(__name__)

    def __init__(self, host="192.168.7.5", user="huaidan",
                 password="huaidan123", database="huaidan"):
        """Open the connection; the defaults are the original fixed values."""
        self.db=pymysql.connect(host=host,user=user,password=password,database=database,charset = 'utf8')
        self.cursor=self.db.cursor()

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the MYSQL_* settings when they are set."""
        settings = crawler.settings
        return cls(
            host=settings.get("MYSQL_HOST", "192.168.7.5"),
            user=settings.get("MYSQL_USER", "huaidan"),
            password=settings.get("MYSQL_PASSWORD", "huaidan123"),
            database=settings.get("MYSQL_DBNAME", "huaidan"),
        )

    def process_item(self, item, spider):
        """Insert ``item`` (if new) and pass it on to the next pipeline."""
        self.insert(item["catalogues"],int(item["id"]),item["article_title"],item["create_time"],item["article_text"])
        return item

    def insert(self,catalogues,id,article_title,create_time,article_text):
        """Insert one row into table ``catalogues`` unless ``id`` already exists.

        Values are passed as query parameters so quotes in the scraped text
        cannot break or alter the statement. The table name cannot be a
        parameter, so it is checked against ``TABLES`` instead. Failures are
        logged and rolled back; they do not propagate.
        """
        if catalogues not in self.TABLES:
            self._log.error("Unknown table %r; item %s not stored", catalogues, id)
            return

        selectsql = " union ".join(
            "select id from {} where id = %s".format(table) for table in self.TABLES
        )
        self.cursor.execute(selectsql, (id,) * len(self.TABLES))
        if self.cursor.fetchone() is not None:
            # The id is already stored in one of the tables.
            return

        insertsql = "insert into {} values (%s, %s, %s, %s)".format(catalogues)
        try:
            self.cursor.execute(insertsql, (id, article_title, create_time, article_text))
            self.db.commit()
        except Exception:
            # Keep the crawl going, but leave a trace of what went wrong.
            self.db.rollback()
            self._log.exception("Insert of item %s into %s failed", id, catalogues)

    def close_spider(self, spider):
        """Close the connection; Scrapy calls this hook when the spider ends."""
        self.db.close()

    def spider_closed(self,spider):
        """Kept for callers of the original method name."""
        self.close_spider(spider)


# Insert items into the MySQL database asynchronously
class MysqlTwisted(object):
    """Store items in MySQL through a Twisted adbapi connection pool.

    An item is inserted only if its id is not yet present in any of the
    tables listed in ``TABLES``.
    """

    # Tables that are searched for an existing id; an item's ``catalogues``
    # value must name one of them.
    TABLES = ("diyijuan", "dierjuan", "disanjuan", "other")

    _log = logging.getLogger(__name__)

    def __init__(self,dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls,settings):
        """Create the pipeline with a pool built from the MYSQL_* settings."""
        dbparms = dict(
            host = settings["MYSQL_HOST"],
            user = settings["MYSQL_USER"],
            passwd = settings["MYSQL_PASSWORD"],
            db = settings["MYSQL_DBNAME"],
            charset = 'utf8',
            cursorclass = pymysql.cursors.DictCursor,
            use_unicode = True,
        )

        dbpool=adbapi.ConnectionPool("pymysql", **dbparms)
        return cls(dbpool)

    # Use twisted to run the MySQL insert asynchronously
    def process_item(self, item, spider):
        """Schedule the insert and pass ``item`` on to the next pipeline.

        The item is returned immediately; without the return value any later
        pipeline would receive ``None``.
        """
        query = self.dbpool.runInteraction(self.do_insert,item)
        query.addErrback(self.handle_error)
        return item

    # Handle errors raised by the asynchronous insert
    def handle_error(self,faileure):
        """Log a failed insert."""
        self._log.error("MySQL insert failed: %s", faileure)

    # Perform the actual insert
    def do_insert(self,cursor,item):
        """Insert ``item`` unless its id is already stored.

        Values are passed as query parameters so quotes in the scraped text
        cannot break or alter the statement. The table name cannot be a
        parameter, so it is checked against ``TABLES`` instead.

        Raises:
            ValueError: if ``item["catalogues"]`` is not one of ``TABLES``.
        """
        id=int(item["id"])
        table = item["catalogues"]
        if table not in self.TABLES:
            raise ValueError("unknown table %r for item %s" % (table, id))

        # Check whether the id already exists in any of the tables
        selectsql = " union ".join(
            "select id from {} where id = %s".format(name) for name in self.TABLES
        )
        cursor.execute(selectsql, (id,) * len(self.TABLES))
        # No row returned means the id is not stored yet, so insert it
        if cursor.fetchone() is None:
            insertsql = "insert into {} values (%s, %s, %s, %s)".format(table)
            cursor.execute(
                insertsql,
                (id, item["article_title"], item["create_time"], item["article_text"]),
            )

    def close_spider(self, spider):
        """Close the connection pool when the spider finishes."""
        self.dbpool.close()







class myarticlepipline:
    """Placeholder pipeline that forwards every item untouched."""

    def process_item(self, item, spider):
        """Return ``item`` as received; ``spider`` is not used."""
        forwarded = item
        return forwarded

settings.py负责存放整体设置

# -*- coding: utf-8 -*-
import os
# Scrapy settings for huaidan project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = "huaidan"

# Both spider-module settings point at the same "<bot name>.spiders" package.
NEWSPIDER_MODULE = BOT_NAME + ".spiders"
SPIDER_MODULES = [NEWSPIDER_MODULE]


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'huaidan (+http://www.yourdomain.com)'

# robots.txt rules are not obeyed by this project
ROBOTSTXT_OBEY = bool(0)

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'huaidan.middlewares.HuaidanSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'huaidan.middlewares.HuaidanDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# The numbers below set the order in which the pipelines run: the smaller the
# number, the earlier the pipeline is executed.
ITEM_PIPELINES = {
   #'huaidan.pipelines.HuaidanPipeline': 300,
    #'scrapy.pipelines.files.FilesPipeline':2,
    #'huaidan.pipelines.myarticlepipline':1,
    #'huaidan.pipelines.MysqlPiplines':2,    # insert into MySQL directly (blocking)
    'huaidan.pipelines.MysqlTwisted':1,     # insert into MySQL asynchronously
}
project_dir = os.path.abspath(os.path.dirname(__file__))
# Was ``FILES_URLS=FIELD =""``, a chained assignment that defined two unrelated
# names (FILES_URLS and FIELD). The FilesPipeline setting is FILES_URLS_FIELD.
# NOTE(review): the empty value is kept from the original; it only matters if
# FilesPipeline is enabled above - confirm the intended field name then.
FILES_URLS_FIELD = ""
FILES_STORE = os.path.join(project_dir,'files')

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Database connection settings, kept here so the pipelines can read them from
# the Scrapy settings (MysqlTwisted.from_settings does). Each value can be
# overridden through an environment variable of the same name, so real
# credentials need not be committed; the defaults are the original values.
MYSQL_HOST = os.environ.get("MYSQL_HOST", "192.168.7.5")
MYSQL_DBNAME = os.environ.get("MYSQL_DBNAME", "huaidan")
MYSQL_USER = os.environ.get("MYSQL_USER", "huaidan")
MYSQL_PASSWORD = os.environ.get("MYSQL_PASSWORD", "huaidan123")

转载于:https://www.cnblogs.com/guoyabin/p/9109933.html