1. Install MongoDB first, then start the database with mongod --dbpath "D:\mongodb\data".
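Before starting the crawl you can quickly verify that MongoDB is reachable with a few lines of pymongo. This is only a sketch, assuming the server listens on the default 127.0.0.1:27017 (the same values used in settings.py later):

# quick connectivity check, assuming MongoDB listens on the default 127.0.0.1:27017
import pymongo

client = pymongo.MongoClient(host='127.0.0.1', port=27017)
print(client.server_info()['version'])    # raises ServerSelectionTimeoutError if the server is unreachable
print(client.list_database_names())       # databases currently present on the server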
scrapy <command> [options] [args]

Available commands:
  bench         Run quick benchmark test
  fetch         Fetch a URL using the Scrapy downloader
  genspider     Generate new spider using pre-defined templates
  runspider     Run a self-contained spider (without creating a project)
  settings      Get settings values
  shell         Interactive scraping console
  startproject  Create new project
  version       Print Scrapy version
  view          Open URL in browser, as seen by Scrapy

  [ more ]      More commands available when run from project directory
2. Create the crawler project with scrapy startproject <project name>:
scrapy startproject novel
You can start your first spider with:
    cd novel
    scrapy genspider example example.com
The generated project contains a runnable example spider, so you can check whether the project was created successfully:
scrapy crawl example --nolog            # run the test; the results are printed to the console
scrapy crawl example --nolog -o a.json  # write the results to the local file a.json
At this point, the project setup is complete.
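Besides the command line, a spider can also be started from a plain Python script through Scrapy's CrawlerProcess API. The sketch below is only an illustration (the file name run.py is made up); it launches the generated example spider with the project's settings.

# run.py -- start the example spider from Python instead of the scrapy CLI
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())  # loads settings.py of the current project
    process.crawl('example')                          # spider name, as set in the spider's "name" attribute
    process.start()                                   # blocks until the crawl finishes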
Open novel/spiders/buquge.py:
# -*- coding: utf-8 -*-
import scrapy
import requests
from bs4 import BeautifulSoup
import re
from ..items import SpidertestItem
from ..items import ChapterItem
import time
import os


class BuqugeSpider(scrapy.Spider):
    name = 'buquge'
    allowed_domains = ['biquge5.com']
    # the first URL to request
    start_urls = ['http://www.biquge5.com/xiaoshuodaquan']

    # the response of the first request is handled here
    def parse(self, response):
        # extract (link, novel name, author) triples from the novel list page
        text = re.findall('<a href="(.*?)">(.*?) <em>(.*?)</em>', response.text)
        for i in text:
            # build a novel-level item (not yielded in this version)
            item = SpidertestItem()
            item['href'] = 'http://www.biquge5.com' + i[0]
            item['name'] = i[1]
            item['author'] = i[2]
            isExists = os.path.exists("../novel")
            if not isExists:
                os.makedirs('../novel')
            # return a Request object; callback names the function that handles
            # the response of that URL
            yield scrapy.Request(url='http://www.biquge5.com' + i[0],
                                 callback=self.parse_content)

    # novel page: title, introduction and chapter list
    def parse_content(self, response):
        # use Scrapy's built-in selectors to read the page elements
        novel_name = response.xpath('//*[@id="info"]/h1/text()').extract_first()
        jieshao = response.xpath('//*[@id="intro"]/p/text()')  # introduction (not used further)
        text = re.findall('<ul class="_chapter">(.*?)</ul>', response.text, re.S)
        chapters = re.findall('<a href="(.*?)\r\n\t\t\t(.*?)">(.*?)</a>', text[0], re.S)
        for item in chapters:
            # build an item object and fill in the chapter information
            chapter = ChapterItem()
            chapter['name'] = novel_name
            chapter['href'] = item[0] + item[1]
            chapter['chapterName'] = item[2]
            # send the item to pipelines.py, where it is saved
            yield chapter
        for item in chapters:
            yield scrapy.Request(url=item[0] + item[1], callback=self.parse_chapterContent)

    # extract the content of a single chapter
    def parse_chapterContent(self, response):
        time.sleep(0.5)
        sort = response.css('#wrapper > div.content_read > div > div.con_top > a:nth-child(4)::text').extract_first()
        novel_name = response.xpath('//*[@id="wrapper"]/div[5]/div[1]/a[3]/text()').extract_first()
        chapter_name = response.css('#wrapper h1::text').extract_first()
        if '/' in chapter_name:
            # strip everything after the opening parenthesis so the chapter name
            # can be used as a file name
            chapter_name = chapter_name.split('(')[0]
        bs = BeautifulSoup(response.text, 'lxml')
        content = bs.select("#content")
        # save the downloaded chapter locally: create the directory first
        isExists = os.path.exists("../novel/" + sort + '/' + novel_name)
        if not isExists:
            os.makedirs('../novel/' + sort + '/' + novel_name)
        # write the chapter text to a .txt file
        with open('../novel/' + sort + '/' + novel_name + '/' + chapter_name + '.txt', 'a+', encoding='utf-8') as f:
            f.write(content[0].get_text())
        # rebuild the base URL and look for the "下一章" (next chapter) link
        url_first = re.findall('http://(.*?).html', response.url)
        urls = re.split('/', url_first[0])
        url = re.findall('<a href="(.*?).html">下一章</a>', response.text)
        # follow the link only when it contains '_' (a continuation page of the current chapter)
        if '_' in url[0]:
            return scrapy.Request(url='http://' + urls[0] + '/' + urls[1] + '/' + url[0] + '.html',
                                  callback=self.parse_chapterContent)
After an item has been scraped by a spider, it is sent to the Item Pipeline, where several components process it in sequence. Each Item Pipeline component is simply a Python class that implements one simple method: it receives an item, runs some logic on it, and decides whether the item continues down the chain or is dropped on the spot, as the sketch below illustrates.
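For illustration only (this class and its name are not part of the project), a pipeline that discards items with no chapter name could look like this:

# illustrative sketch: a pipeline that either passes an item on or drops it
from scrapy.exceptions import DropItem


class RequiredFieldsPipeline(object):      # hypothetical class, not used by the novel project
    def process_item(self, item, spider):
        if not item.get('chapterName'):
            # raising DropItem stops the item from reaching later pipeline components
            raise DropItem('missing chapterName: %s' % item)
        return item                        # hand the item to the next component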
/novel/pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
from scrapy.conf import settings


# receives the item objects from items.py and saves them to MongoDB
class SpidertestPipeline(object):

    # open_spider() is called once when the spider starts
    def open_spider(self, spider):
        host = settings['MONGODB_HOST']
        port = settings['MONGODB_PORT']
        db_name = settings['MONGODB_DBNAME']
        client = pymongo.MongoClient(host=host, port=port)
        db = client[db_name]
        self.post = db[settings['MONGODB_DOCNAME']]

    # process each item object
    def process_item(self, item, spider):
        data = {
            'url': item['href'],
            'name': item['name'],
            'chapterName': item['chapterName'],
        }
        # insert the record into the collection
        self.post.insert(data)
        return item

    # close_spider(self, spider) would be called when the spider closes


class ChapterPipeline(object):

    def open_spider(self, spider):
        client = pymongo.MongoClient(host='127.0.0.1', port=27017)
        db = client['scrapy']
        self.post = db['chapter']

    def process_item(self, item, spider):
        data2 = {
            'url': item['href'],
            'name': item['name'],
            'chapterName': item['chapterName'],
        }
        self.post.insert(data2)
        return item
An Item is the container that holds the scraped data. It is used much like a Python dict, but it adds an extra safeguard against undefined fields caused by typos: if a field is not declared on the Item, the data scraped by the spider cannot be assigned to it, as the short snippet below shows.
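Illustrative snippet only (BookItem is a made-up item, similar in spirit to SpidertestItem):

# shows the field protection provided by scrapy.Item
import scrapy


class BookItem(scrapy.Item):        # hypothetical item, for demonstration only
    name = scrapy.Field()
    author = scrapy.Field()


book = BookItem()
book['name'] = 'example'            # works: 'name' is a declared field
try:
    book['auther'] = 'someone'      # typo: 'auther' was never declared
except KeyError as e:
    print('rejected undeclared field:', e)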
/novel/items.py
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


# every item class inherits from scrapy.Item
class SpidertestItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    href = scrapy.Field()
    name = scrapy.Field()
    author = scrapy.Field()


class ChapterItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    href = scrapy.Field()
    name = scrapy.Field()
    chapterName = scrapy.Field()
settings.py is loaded automatically when the crawler starts and supplies the project's configuration.
/novel/settings.py
# loaded automatically at startup
ITEM_PIPELINES = {
    'novel.pipelines.SpidertestPipeline': 300,
    # 'novel.pipelines.ChapterPipeline': 300,
}

# MongoDB connection settings
# host address
MONGODB_HOST = '127.0.0.1'
# port
MONGODB_PORT = 27017
# database name
MONGODB_DBNAME = 'scrapy'
# collection name
MONGODB_DOCNAME = 'ty_wiwj'
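One note on compatibility: from scrapy.conf import settings, used in pipelines.py above, has been deprecated and removed in recent Scrapy releases. A version-independent alternative (a sketch reusing the same setting names) is to let Scrapy pass the settings to the pipeline through a from_crawler class method:

# sketch: reading the MongoDB settings without scrapy.conf
import pymongo


class SpidertestPipeline(object):

    def __init__(self, host, port, db_name, doc_name):
        self.host = host
        self.port = port
        self.db_name = db_name
        self.doc_name = doc_name

    @classmethod
    def from_crawler(cls, crawler):
        # crawler.settings exposes everything defined in settings.py
        s = crawler.settings
        return cls(s.get('MONGODB_HOST'), s.getint('MONGODB_PORT'),
                   s.get('MONGODB_DBNAME'), s.get('MONGODB_DOCNAME'))

    def open_spider(self, spider):
        client = pymongo.MongoClient(host=self.host, port=self.port)
        self.post = client[self.db_name][self.doc_name]

    def process_item(self, item, spider):
        self.post.insert_one(dict(item))   # insert_one replaces the deprecated insert()
        return item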