Catalog
- Return the HTML of the crawled page
response.text
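For context, a minimal sketch of the spider these snippets belong to (the class name and start URL are assumptions based on the quotes.toscrape.com tutorial; the spider name matches the `scrapy crawl quotetutorial` commands used later):
```python
import scrapy

class QuotesSpider(scrapy.Spider):
    name = 'quotetutorial'
    # assumed start URL, taken from the quotes.toscrape.com example site
    start_urls = ['http://quotes.toscrape.com/']

    def parse(self, response):
        # response.text holds the raw HTML of the crawled page
        print(response.text)
```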
- Define the data structure for the scraped content in items.py
import scrapy

class QuotesItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    text = scrapy.Field()
    author = scrapy.Field()
    tags = scrapy.Field()
- CSS selector usage
quotes = response.css('.quote')
- Output the text content inside a tag
text = quote.css('.text::text')
- Get the first matching value
author = quote.css('.author::text').extract_first()
- Get all matching values
tags = quote.css('.tags .tag::text').extract()
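In recent Scrapy versions, .get() and .getall() are the preferred aliases for .extract_first() and .extract(); both forms still work.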
- Debugging a site interactively with the Scrapy shell
scrapy shell quotes.toscrape.com
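Inside the shell, selectors can be tried out interactively before writing them into the spider; for example (outputs depend on the live page):
```python
# typical commands typed inside the scrapy shell session
quotes = response.css('.quote')
quotes[0].css('.text::text').extract_first()   # text of the first quote
quotes[0].css('.tags .tag::text').extract()    # all tags of the first quote
```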
- Storing the scraped content: populating the item
item = QuotesItem()  # instantiate the item defined in items.py
text = quote.css('.text::text').extract_first()  # select the .text element inside this quote and extract its text content
author = quote.css('.author::text').extract_first()
tags = quote.css('.tags .tag::text').extract()
item['text'] = text
item['author'] = author
item['tags'] = tags
- Extract the value of a tag attribute
next = response.css('.pager .next a::attr(href)').extract_first()  # get the URL of the next page
- URL joining
url = response.urljoin(next)  # join the base URL with the relative next-page path
- Recursively schedule the same parse callback (combined sketch below)
yield scrapy.Request(url=url, callback=self.parse)  # pass the method itself, do not call it
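Putting the snippets above together, the parse() method looks roughly like this (a sketch; it assumes QuotesItem is imported from the project's items.py):
```python
def parse(self, response):
    for quote in response.css('.quote'):
        item = QuotesItem()
        item['text'] = quote.css('.text::text').extract_first()
        item['author'] = quote.css('.author::text').extract_first()
        item['tags'] = quote.css('.tags .tag::text').extract()
        yield item

    # follow the next-page link, if any, and parse it with this same method
    next = response.css('.pager .next a::attr(href)').extract_first()
    if next:
        url = response.urljoin(next)
        yield scrapy.Request(url=url, callback=self.parse)
```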
- Saving output to a file (feed exports)
scrapy crawl quotetutorial -o quotes.jl
scrapy crawl quotetutorial -o quotes.json
scrapy crawl quotetutorial -o quotes.csv
scrapy crawl quotetutorial -o quotes.xml
scrapy crawl quotetutorial -o quotes.marshal
scrapy crawl quotetutorial -o quotes.pickle
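Alternatively, assuming Scrapy 2.1 or later, the output feeds can be configured once in settings.py instead of passing -o on every run; the file names below are illustrative:
```python
# settings.py -- illustrative feed configuration (Scrapy >= 2.1)
FEEDS = {
    'quotes.json': {'format': 'json', 'encoding': 'utf8'},
    'quotes.csv': {'format': 'csv'},
}
```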
- Pipelines
Processing items:
from scrapy.exceptions import DropItem

class QuotesPipeline:
    def __init__(self):
        self.limit = 50  # maximum length kept for the text field

    def process_item(self, item, spider):
        if item['text']:
            if len(item['text']) > self.limit:
                # truncate overly long quotes and append an ellipsis
                item['text'] = item['text'][0:self.limit].rstrip() + '...'
            return item
        else:
            raise DropItem('Missing Text')
Saving to MongoDB:
import pymongo

class MongoPipeline(object):
    def __init__(self, mongo_url, mongo_db):
        self.mongo_url = mongo_url
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # read the connection settings from settings.py
        return cls(
            mongo_url=crawler.settings.get('MONGO_URL'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_url)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        # use the item class name as the collection name
        name = item.__class__.__name__
        self.db[name].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()
Configure MongoDB and the pipelines in settings.py:
MONGO_URL = 'localhost'
MONGO_DB = 'quotesdb'
ITEM_PIPELINES = {
    'quotes.pipelines.QuotesPipeline': 300,
    'quotes.pipelines.MongoPipeline': 400,
}
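The integer values set the execution order: pipelines with lower numbers run first (valid range 0-1000), so QuotesPipeline trims the text before MongoPipeline writes the item to the database.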