1. 爬取博客园中每条新闻的标题和 URL，在 cnblog.py 中写入操作内容：
import scrapy
import sys
import io
from..items import cnlogsItem
from scrapy.selector import Selector
# Rebind sys.stdout to a new text wrapper over the same underlying byte
# buffer, so that everything subsequently written to stdout is encoded as
# UTF-8. This runs as a module-level side effect when the module is loaded.
# NOTE(review): presumably a workaround for consoles whose default encoding
# cannot represent the scraped Chinese text — confirm it is still needed.
sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding="utf-8")
class CnblogsSpider(scrapy.Spider):
name = 'cnblogs'
allowed_domains = ['cnblogs.com']
start_urls = ['http://cnblogs.com/']
def parse(self, response):
line = Selector(response=response).xpath('//div[@id="post_list"]//div[@class="post_item_body"]')
# href = Selector(response=response).xpath('//div[@id="post_list"]//div[@class="post_item_body"]/h3/a[@class="titlelnk"]/@href').extract()
items = []
for node in line:
title = node.xpath('./h3/a[@class="titlelnk"]/text()').extract()