Prerequisite: Splash must be correctly configured.
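A minimal sketch of that configuration, assuming Splash is running locally on port 8050 (for example via the scrapinghub/splash Docker image) and following the standard scrapy-splash setup from its README:

# settings.py -- wire scrapy-splash into the Scrapy project
SPLASH_URL = 'http://localhost:8050'

DOWNLOADER_MIDDLEWARES = {
    'scrapy_splash.SplashCookiesMiddleware': 723,
    'scrapy_splash.SplashMiddleware': 725,
    'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware': 810,
}

SPIDER_MIDDLEWARES = {
    'scrapy_splash.SplashDeduplicateArgsMiddleware': 100,
}

DUPEFILTER_CLASS = 'scrapy_splash.SplashAwareDupeFilter'
HTTPCACHE_STORAGE = 'scrapy_splash.SplashAwareFSCacheStorage'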
On to the spider itself.
The code below crawls all the free chapters of a single novel, starting from that novel's index page. If you want to crawl every novel instead, only a small change to the code is needed (see the sketch after the spider).
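The spider imports two project-local modules that are not shown in this post. Here are minimal sketches of both, inferred from how the spider uses them; the field comments and the User-Agent string are assumptions:

# hello/items.py -- sketch inferred from the spider's item usage
import scrapy

class HelloItem(scrapy.Item):
    title = scrapy.Field()  # chapter title
    url = scrapy.Field()    # chapter page URL

# tools/headers.py -- hypothetical stand-in for my personal helper module;
# any fixed browser User-Agent string works just as well
class UserAgent:
    @staticmethod
    def PC():
        return ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                'AppleWebKit/537.36 (KHTML, like Gecko) '
                'Chrome/63.0.3239.132 Safari/537.36')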
import scrapy
from scrapy_splash import SplashRequest
from tools.headers import UserAgent  # my own helper module; if you don't have it, substitute a plain browser User-Agent string
from hello.items import HelloItem
from bs4 import BeautifulSoup  # used here to work around encoding issues; questions welcome


class SearchSpider(scrapy.Spider):
    name = 'search'
    # allowed_domains = ['helllo.com']

    def start_requests(self):
        # The URL below can be changed to any novel's index (table of contents) page
        yield SplashRequest(url='http://chuangshi.qq.com/bk/qh/AGkENV1jVjYAP1RtATYBYQ-l.html',
                            callback=self.parse,
                            headers={'User-Agent': UserAgent.PC()})

    def parse(self, response):
        for area in response.css('.index_area>div'):
            # Sections whose heading contains a <span> are the non-free ones
            # on this site, so skip them and keep only the free chapters
            if not area.css('.topfill>h1>span'):
                for li_ in area.css('.list>ul>li'):
                    item = HelloItem()
                    item['title'] = li_.css('a b::text').extract_first()
                    item['url'] = li_.css('a::attr(href)').extract_first()
                    yield SplashRequest(url=item['url'],
                                        args={'wait': 10},
                                        callback=self.download,
                                        headers={'User-Agent': UserAgent.PC()})

    def download(self, response):
        doc = BeautifulSoup(response.body.decode('utf-8'), 'lxml')
        filename = doc.title.string[:-4] + '.txt'  # strip the trailing site suffix from the page title
        with open(filename, 'a+', encoding='utf-8') as f:
            for p in doc.find(class_='bookreadercontent').find_all('p'):
                f.write(p.get_text() + '\n')  # get_text() avoids a TypeError on empty <p> tags
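Run the spider with scrapy crawl search; each chapter's text is appended to a .txt file derived from the page title.

As for crawling every novel: the idea is to start from the category listing page instead of a single book, collect each book's index URL, and hand it to the existing parse(). A hypothetical sketch; the listing URL and the '.book_list a' selector are assumptions, so inspect the real page and adjust them:

    def start_requests(self):
        # Assumed category listing URL -- replace with the real one
        yield SplashRequest(url='http://chuangshi.qq.com/bk/qh/',
                            callback=self.parse_listing,
                            headers={'User-Agent': UserAgent.PC()})

    def parse_listing(self, response):
        # '.book_list a' is an assumed selector for links to each novel's index page
        for href in response.css('.book_list a::attr(href)').extract():
            yield SplashRequest(url=response.urljoin(href),
                                callback=self.parse,
                                headers={'User-Agent': UserAgent.PC()})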
- This code is for learning purposes only.
- Everything above is my own understanding; if anything is wrong, please leave a comment to correct me.