Scraping Qiushibaike jokes with Scrapy
Setup
- Create the project: open cmd, change into a working directory, and create the project with
scrapy startproject <project_name>
- Create the spider file: first change into the project directory created above with
cd <project_name>
and then run
scrapy genspider <spider_name> <site_domain>
(a concrete example is shown right after this list)
- The generated .py files have the following contents.
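Before looking at those files, here is what the two commands look like for the qsbk project used in this post (the names are simply the ones that appear in the code below):
scrapy startproject qsbk
cd qsbk
scrapy genspider qsbk_spider qiushibaike.com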
qsbk_spider.py
# -*- coding: utf-8 -*-
'''
response is a scrapy.http.response.html.HtmlResponse object; xpath/css expressions can be run on it to extract data.
The extracted data is a Selector or SelectorList object. To get the strings out of it, call get or getall.
getall returns all texts of the selector, as a list.
get returns the first text of the selector, as a str.
pipeline: saves the data; three methods are commonly used: open_spider(self, spider), process_item(self, item, spider), close_spider(self, spider).
Activate the pipeline in settings.py.
'''
import scrapy
from scrapy.selector.unified import SelectorList
from qsbk.items import QsbkItem


class QsbkSpiderSpider(scrapy.Spider):
    name = 'qsbk_spider'
    allowed_domains = ['qiushibaike.com']
    start_urls = ['https://www.qiushibaike.com/text/page/1/']
    base_domain = "https://www.qiushibaike.com"

    def parse(self, response):
        # SelectorList
        duanzis = response.xpath("//div[@id='content-left']/div")
        for duanzi in duanzis:
            # Selector
            author = duanzi.xpath(".//h2/text()").get().strip()
            content = duanzi.xpath(".//div[@class='content']/span//text()").getall()  # returns a list
            content = "".join(content).strip()
            item = QsbkItem(author=author, content=content)
            yield item
        next_url = response.xpath("//ul[@class='pagination']/li[last()]/a/@href").get()
        if not next_url:  # no next-page link: stop
            return
        else:
            yield scrapy.Request(self.base_domain + next_url, callback=self.parse)
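Before wiring XPath expressions into parse, they can be tried out interactively with scrapy shell. A quick sketch, run from inside the project directory so the request headers from settings.py are applied; the URL is the first listing page from start_urls and the selectors are the same ones used above:
scrapy shell "https://www.qiushibaike.com/text/page/1/"
>>> duanzis = response.xpath("//div[@id='content-left']/div")                 # SelectorList
>>> duanzis[0].xpath(".//h2/text()").get()                                    # first author, as a str
>>> duanzis[0].xpath(".//div[@class='content']/span//text()").getall()        # all text fragments, as a list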
items.py
import scrapy


class QsbkItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    author = scrapy.Field()
    content = scrapy.Field()
pipelines.py
# -*- coding: utf-8 -*-
'''
When saving JSON data, the following two classes make the job easier:
1. JsonItemExporter: keeps appending the data in memory and writes everything to disk at the end. Advantage: the stored file is valid JSON; drawback: not suitable for large amounts of data.
2. JsonLinesItemExporter: writes the item to disk every time export_item is called. Advantages: suitable for large amounts of data, and the data is also safer; drawback: one dict per line, so the file as a whole is not valid JSON.
'''
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
# =============================================================================
# import json
#
# class QsbkPipeline(object):
#     def __init__(self):
#         # could also be done in open_spider
#         self.fp = open("duanzi.json", "w", encoding='utf-8')
#
#     def open_spider(self, spider):
#         print('Spider started...')
#
#     def process_item(self, item, spider):
#         item_json = json.dumps(dict(item), ensure_ascii=False)
#         self.fp.write(item_json + '\n')
#         return item
#
#     def close_spider(self, spider):
#         self.fp.close()
#         print('Spider finished...')
# =============================================================================
# =============================================================================
# from scrapy.exporters import JsonItemExporter
#
# class QsbkPipeline(object):
#     def __init__(self):
#         # could also be done in open_spider
#         self.fp = open("duanzi.json", "wb")
#         self.exporter = JsonItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')
#         self.exporter.start_exporting()
#
#     def open_spider(self, spider):
#         print('Spider started...')
#
#     def process_item(self, item, spider):
#         self.exporter.export_item(item)
#         return item
#
#     def close_spider(self, spider):
#         self.exporter.finish_exporting()
#         self.fp.close()
#         print('Spider finished...')
# =============================================================================
from scrapy.exporters import JsonLinesItemExporter


class QsbkPipeline(object):
    def __init__(self):
        # could also be done in open_spider
        self.fp = open("duanzi.json", "wb")
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')

    def open_spider(self, spider):
        print('Spider started...')

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.fp.close()
        print('Spider finished...')
settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for qsbk project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'qsbk'
SPIDER_MODULES = ['qsbk.spiders']
NEWSPIDER_MODULE = 'qsbk.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'qsbk (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36",
}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'qsbk.middlewares.QsbkSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'qsbk.middlewares.QsbkDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'qsbk.pipelines.QsbkPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
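With the pipeline activated in ITEM_PIPELINES, the spider is started from the project root (the directory containing scrapy.cfg):
scrapy crawl qsbk_spider
A small launcher script is also handy for running it from an IDE; the file name start.py below is just a convention, not something Scrapy generates:
# start.py (optional helper, placed next to scrapy.cfg)
from scrapy import cmdline

# equivalent to running "scrapy crawl qsbk_spider" on the command line
cmdline.execute("scrapy crawl qsbk_spider".split())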
Crawling WeChat mini-program articles with CrawlSpider
wxapp_spider.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from wxapp.items import WxappItem


class WxappSpiderSpider(CrawlSpider):
    name = 'wxapp_spider'
    allowed_domains = ['wxapp-union.com']
    start_urls = ['http://www.wxapp-union.com/portal.php?mod=list&catid=2&page=1']

    rules = (
        # follow the paginated list pages, without parsing them
        Rule(LinkExtractor(allow=r'.+mod=list&catid=2&page=\d'), follow=True),
        # parse the article pages, without following links inside them
        Rule(LinkExtractor(allow=r'.+article-.+\.html'), callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        title = response.xpath("//h1[@class='ph']/text()").get()
        author_p = response.xpath("//p[@class='authors']")
        author = author_p.xpath(".//a/text()").get()
        pub_time = author_p.xpath(".//span/text()").get()
        content = response.xpath("//td[@id='article_content']//text()").getall()
        content = "".join(content).strip()
        item = WxappItem(title=title, author=author, pub_time=pub_time, content=content)
        yield item
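The two Rule objects split the crawl: the first one follows every paginated list URL without a callback, while the second one sends article pages to parse_item and does not follow links further. A tiny standalone check of the two allow patterns; the article URL here is made up purely to illustrate the shape of URL the regex expects:
import re

list_pat = r'.+mod=list&catid=2&page=\d'
article_pat = r'.+article-.+\.html'

# the real list page from start_urls matches the first rule's pattern
print(bool(re.match(list_pat, 'http://www.wxapp-union.com/portal.php?mod=list&catid=2&page=1')))  # True
# a hypothetical article URL matching the second rule's pattern
print(bool(re.match(article_pat, 'http://www.wxapp-union.com/article-1234-1.html')))  # True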
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class WxappItem(scrapy.Item):
    # define the fields for your item here like:
    title = scrapy.Field()
    author = scrapy.Field()
    pub_time = scrapy.Field()
    content = scrapy.Field()
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exporters import JsonLinesItemExporter


class WxappPipeline(object):
    def __init__(self):
        self.fp = open('wxjc.json', 'wb')
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.fp.close()
settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for wxapp project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'wxapp'
SPIDER_MODULES = ['wxapp.spiders']
NEWSPIDER_MODULE = 'wxapp.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'wxapp (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36",
}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'wxapp.middlewares.WxappSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'wxapp.middlewares.WxappDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'wxapp.pipelines.WxappPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
Crawling 51job Python positions with CrawlSpider
wyjob_spider.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from wyjob.items import WyjobItem


class WyjobSpiderSpider(CrawlSpider):
    name = 'wyjob_spider'
    allowed_domains = ['51job.com']
    start_urls = ['https://search.51job.com/list/000000,000000,0000,00,9,99,python,2,1.html']

    rules = (
        # follow the paginated search-result pages
        Rule(LinkExtractor(allow=r'.+list/000000,000000,0000,00,9,99,python,2,\d\.html'), follow=True),
        # parse the individual job-posting pages
        Rule(LinkExtractor(allow=r'https://jobs.51job.com/.+\d\.html\?s=01&t=0'), callback='parse_item', follow=False),
    )

    def parse_item(self, response):
        title = response.xpath("//div[@class='cn']/h1/text()").get().strip()
        salary = response.xpath("//div[@class='cn']/strong/text()").get()
        company = response.xpath("//p[@class='cname']/a/text()").get().strip()
        base_info = response.xpath("//p[@class='msg ltype']//text()").getall()  # list of text fragments
        base_info = "".join(base_info)  # join the list into one string, roughly "location | experience | education | ..."
        location = base_info.split("|")[0].strip()
        workyears = base_info.split("|")[1].strip()
        education = base_info.split("|")[2].strip()
        position_info = response.xpath("//div[@class='bmsg job_msg inbox']/p//text()").getall()
        position_info = "".join(position_info).strip()
        item = WyjobItem(title=title, salary=salary, company=company, location=location,
                         workyears=workyears, education=education, position_info=position_info)
        yield item
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class WyjobItem(scrapy.Item):
    title = scrapy.Field()
    salary = scrapy.Field()
    company = scrapy.Field()
    location = scrapy.Field()
    workyears = scrapy.Field()
    education = scrapy.Field()
    position_info = scrapy.Field()
pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.exporters import JsonLinesItemExporter


class WyjobPipeline(object):
    def __init__(self):
        self.fp = open('wyjob.json', 'wb')
        self.exporter = JsonLinesItemExporter(self.fp, ensure_ascii=False, encoding='utf-8')

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):  # was close_item; Scrapy only calls close_spider
        self.fp.close()
settings.py
# -*- coding: utf-8 -*-
# Scrapy settings for wyjob project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'wyjob'
SPIDER_MODULES = ['wyjob.spiders']
NEWSPIDER_MODULE = 'wyjob.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'wyjob (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'wyjob.middlewares.WyjobSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'wyjob.middlewares.WyjobDownloaderMiddleware': 543,
#}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'wyjob.pipelines.WyjobPipeline': 300,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'