scrapy+mysql

mysql下载安装就不说了,自己搜搜就行了,scrapy项目创建前面也说过了,直接上代码

spider文件,从网页获取下一页网址然后递归爬取,直到没有下一章为止

import scrapy
import requests
from lxml import etree
from ..items import FirstpaItem

class TestSpider(scrapy.Spider):
    """Spider that crawls qidian chapter pages, following each page's
    "next chapter" link recursively until no further link exists."""

    name = 'Test'
    allowed_domains = ['read.qidian.com']
    start_urls = ['https://read.qidian.com/chapter/5YAQ3XblbtNqqtWmhQLkJA2/DJ0272ECn7hp4rPq4Fd4KQ2']

    def parse(self, response):
        # Collect the chapter heading(s) and all body paragraphs on this page.
        item = FirstpaItem()
        item["title"] = response.xpath("//h3/span[contains(text(),\"第\")]/text()").extract()
        item["content"] = response.xpath("//div/p/text()").extract()
        yield item

        # The "next chapter" anchor holds a protocol-relative href; if one
        # exists, prefix the scheme and schedule it with this same callback.
        links = response.xpath("//a[contains(text(),\"下一章\")]/@href").extract()
        if links:
            yield scrapy.Request(url="https:" + str(links[0]), callback=self.parse)

items文件

import scrapy
class FirstpaItem(scrapy.Item):
    """Container for one scraped chapter: its title strings and the
    list of body-paragraph strings."""
    title = scrapy.Field()
    content = scrapy.Field()

pipelines文件

from mysql import connector

class FirstpaPipeline:
    """Scrapy item pipeline that persists scraped chapters into the
    MySQL table ``txt.content`` as (name, content) rows."""

    # Called once when the spider starts: open the connection and cursor.
    def open_spider(self, spider):
        self.db = connector.connect(host="localhost", user="root", passwd="root")
        self.db_cursor = self.db.cursor()

    def process_item(self, item, spider):
        """Clean one chapter's text and insert it as a row.

        Returns the item so any subsequent pipelines still receive it
        (required by Scrapy's item-pipeline contract).
        """
        item_name = item["title"][0]
        # Join all paragraphs in one pass instead of quadratic += concatenation.
        item_content = "".join(item["content"])
        # Strip the site's comment/bookmark boilerplate that the XPath also captures.
        item_content = item_content.replace("共{{commentTotal}}条帖子已显示全部还没有人发表评论还没有人发表评论已显示全部点击书签后,可收藏每个章节的书签,“阅读进度”可以在个人中心书架里查看", "")
        # Replace full-width paragraph indentation with newline + 4 spaces.
        item_content = item_content.replace("\u3000\u3000", "\n    ")
        # Parameterized query: the driver escapes the values, no string formatting.
        sql = "insert txt.content(name,content) values(%s,%s)"
        self.db_cursor.execute(sql, (item_name, item_content))
        self.db.commit()
        return item  # fix: process_item must return the item for later pipelines

    # Called once when the spider closes: release cursor, then connection.
    def close_spider(self, spider):
        self.db_cursor.close()  # fix: close the cursor before the connection
        self.db.close()
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值