# -*- coding: utf-8 -*-
import os

import scrapy

from ..items import BokeItem
class BkSpider(scrapy.Spider):
    """Crawl the oschina.net blog categories and download entry thumbnails.

    Flow: ``parse`` reads the category links on the start page,
    ``blog_parse`` builds one ``BokeItem`` per listing entry that has a
    jpg/png/gif thumbnail, and ``img_parse`` writes the thumbnail to
    ``./img`` before emitting the item.
    """

    name = 'bk'
    # allowed_domains = ['bai.com']
    start_urls = ['https://www.oschina.net/blog']

    # Entries whose thumbnail URL does not end in one of these are skipped.
    _IMAGE_SUFFIXES = ('.jpg', '.png', '.gif')
    # Directory the thumbnails are written to (relative to the working dir).
    _IMAGE_DIR = './img'

    def img_parse(self, response):
        """Save the downloaded thumbnail to disk, then emit its item.

        Args:
            response: Image response; ``response.meta['item']`` carries the
                item built in ``blog_parse`` and ``response.body`` the raw
                image bytes.

        Yields:
            The item taken from ``response.meta``.
        """
        item = response.meta['item']
        # The target directory may not exist on a fresh checkout.
        os.makedirs(self._IMAGE_DIR, exist_ok=True)
        target = os.path.join(self._IMAGE_DIR, item['imgname'])
        with open(target, 'wb') as f:
            f.write(response.body)
        yield item

    def blog_parse(self, response):
        """Build an item for every listing entry and request its thumbnail.

        Entries without a thumbnail, or whose thumbnail URL does not end in
        one of ``_IMAGE_SUFFIXES``, are skipped.

        Args:
            response: Category listing page; ``response.meta['typ']`` is the
                category label collected in ``parse``.

        Yields:
            One ``scrapy.Request`` per accepted entry, routed to
            ``img_parse`` with the populated item in ``meta``.
        """
        typ = response.meta['typ']
        # Common prefix of the per-entry metadata cells.
        extra = './/div[@class="extra"]//div[@class="item"]'

        for div in response.xpath('//div[contains(@class,"blog-item")]'):
            imgurl = div.xpath('./a/img/@src').extract_first()
            # Explicit guard instead of a bare ``except``: a bare except
            # around a ``yield`` also swallows GeneratorExit and hides bugs.
            if not imgurl or not imgurl.endswith(self._IMAGE_SUFFIXES):
                continue

            item = BokeItem()
            item['title'] = div.xpath('./div/a/@title').extract_first()
            item['brief'] = div.xpath('./div/div[1]/p/text()').extract_first()
            item['author'] = div.xpath(extra + '[1]/a/text()').extract_first()
            item['date'] = div.xpath(extra + '[2]/text()').extract_first()
            item['readt'] = div.xpath(extra + '[3]/text()').extract_first()
            item['comment'] = div.xpath(extra + '[4]/a/text()').extract_first()
            item['like'] = div.xpath(extra + '[5]/text()').extract_first()
            item['typ'] = typ
            # File name is the last path segment of the thumbnail URL.
            item['imgname'] = imgurl.split('/')[-1]

            # NOTE(review): with Scrapy's default duplicate filter a second
            # entry sharing the same thumbnail URL is not requested again,
            # so its item is never emitted - confirm that is acceptable.
            yield scrapy.Request(
                # urljoin resolves relative and protocol-relative URLs.
                url=response.urljoin(imgurl),
                callback=self.img_parse,
                meta={'item': item},
            )

    def parse(self, response):
        """Follow every category link found on the start page.

        Args:
            response: Response for one of ``start_urls``.

        Yields:
            One ``scrapy.Request`` per category link, routed to
            ``blog_parse`` with the link text passed as ``meta['typ']``.
        """
        links = response.xpath(
            '//*[@id="mainScreen"]/div/div[1]/div/div[1]/div[1]/a[position()>1]'
        )
        # Read href and label from the same anchor so they cannot get out
        # of step when an anchor lacks text or a URL appears twice.
        for link in links:
            href = link.xpath('./@href').extract_first()
            if not href:
                continue
            yield scrapy.Request(
                url=response.urljoin(href),
                callback=self.blog_parse,
                meta={'typ': link.xpath('./text()').extract_first()},
            )
在pipelines.py中配置MongoDB数据库
import pymongo
class BokePipeline(object):
    """Item pipeline that stores every scraped item in MongoDB.

    The client is opened when the spider starts and closed when it
    finishes, so importing this module has no side effects and the
    connection is not leaked.
    """

    # Set by open_spider(); None while no connection is open.
    conn = None
    db = None
    table = None

    def open_spider(self, spider):
        """Open the MongoDB client and select the target collection."""
        self.conn = pymongo.MongoClient()
        # Placeholder identifiers from the original: replace them with the
        # real database name and collection name.
        self.db = self.conn.库名
        self.table = self.db.表名

    def close_spider(self, spider):
        """Close the MongoDB client if one is open."""
        if self.conn is not None:
            self.conn.close()
        self.conn = None
        self.db = None
        self.table = None

    def process_item(self, item, spider):
        """Insert the item as one document and pass it on unchanged.

        Args:
            item: The scraped item; converted with ``dict(item)`` for storage.
            spider: The spider that produced the item.

        Returns:
            The same ``item``, so later pipelines can keep processing it.
        """
        if self.table is None:
            # Keeps the pipeline usable when called without open_spider().
            self.open_spider(spider)
        self.table.insert_one(dict(item))
        return item
在items.py中配置需要的字段
import scrapy
class BokeItem(scrapy.Item):
    """Fields collected for one blog listing entry."""

    # Values read from the listing entry itself.
    title = scrapy.Field()    # value of the entry link's title attribute
    brief = scrapy.Field()    # first text node of the entry's paragraph

    # Values read from the entry's "extra" row, cells 1 to 5 in order.
    author = scrapy.Field()
    date = scrapy.Field()
    readt = scrapy.Field()    # presumably the read count - confirm
    comment = scrapy.Field()
    like = scrapy.Field()

    # Values added by the spider.
    typ = scrapy.Field()      # label of the category link that was followed
    imgname = scrapy.Field()  # last path segment of the thumbnail URL