I won't cover downloading and installing MySQL here, you can look that up yourself, and creating the Scrapy project was covered in an earlier post, so let's go straight to the code.
The spider file: it grabs the next chapter's URL from each page and then keeps crawling recursively.
import scrapy

from ..items import FirstpaItem


class TestSpider(scrapy.Spider):
    name = 'Test'
    allowed_domains = ['read.qidian.com']
    start_urls = ['https://read.qidian.com/chapter/5YAQ3XblbtNqqtWmhQLkJA2/DJ0272ECn7hp4rPq4Fd4KQ2']

    def parse(self, response):
        # chapter title: the <span> under <h3> whose text contains "第"
        titles = response.xpath('//h3/span[contains(text(), "第")]/text()').extract()
        # chapter body: every <p> text node inside the content <div>
        contents = response.xpath('//div/p/text()').extract()
        # href of the "下一章" (next chapter) link
        next_url = response.xpath('//a[contains(text(), "下一章")]/@href').extract()
        item = FirstpaItem()
        item["title"] = titles
        item["content"] = contents
        yield item
        # follow the next chapter and parse it with this same method (recursive crawl)
        if next_url:
            next_url = "https:" + str(next_url[0])
            yield scrapy.Request(url=next_url, callback=self.parse)
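For the crawl to actually follow chapter after chapter, Scrapy has to be allowed to fetch the pages, and it is worth slowing the requests down a little. A minimal settings.py sketch; the concrete values are assumptions, tune them yourself:

# settings.py (values are assumptions, adjust as needed)
ROBOTSTXT_OBEY = False   # the site's robots.txt may otherwise stop the crawl after the first request
DOWNLOAD_DELAY = 1       # seconds between requests, be gentle with the site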
The items file:
import scrapy


class FirstpaItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()
    content = scrapy.Field()
The pipelines file:
from mysql import connector


class FirstpaPipeline:
    # runs once when the spider starts: open the MySQL connection
    def open_spider(self, spider):
        self.db = connector.connect(host="localhost", user="root", passwd="root")
        self.db_cursor = self.db.cursor()

    def process_item(self, item, spider):
        item_name = item["title"][0]
        # join the paragraph fragments into a single string
        item_content = "".join(item["content"])
        # strip the comment-section boilerplate Qidian appends to the page text
        item_content = item_content.replace("共{{commentTotal}}条帖子已显示全部还没有人发表评论还没有人发表评论已显示全部点击书签后,可收藏每个章节的书签,“阅读进度”可以在个人中心书架里查看", "")
        # turn the full-width indent spaces into line breaks
        item_content = item_content.replace("\u3000\u3000", "\n ")
        sql = "insert into txt.content(name, content) values(%s, %s)"
        self.db_cursor.execute(sql, (item_name, item_content))
        self.db.commit()
        return item

    # runs once when the spider closes: release the connection
    def close_spider(self, spider):
        self.db.close()
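The pipeline assumes a MySQL database named txt with a content table already exists; that step isn't shown above, so here is a one-off setup sketch. The column types and lengths are assumptions, pick whatever fits your data:

# run once before crawling; column types are assumptions
from mysql import connector

db = connector.connect(host="localhost", user="root", passwd="root")
cursor = db.cursor()
cursor.execute("CREATE DATABASE IF NOT EXISTS txt CHARACTER SET utf8mb4")
cursor.execute(
    "CREATE TABLE IF NOT EXISTS txt.content ("
    "id INT AUTO_INCREMENT PRIMARY KEY, "
    "name VARCHAR(255), "
    "content LONGTEXT)"
)
db.close()

Finally, the pipeline has to be enabled in settings.py; the dotted path below assumes the project module is called firstpa, so change it to match your own project, then start the crawl with scrapy crawl Test:

ITEM_PIPELINES = {
    "firstpa.pipelines.FirstpaPipeline": 300,
}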