import requests
import lxml.etree
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor  # extracts links matching a pattern
import items  # the project's items module defining SinaItem; adjust the path to your project layout

# Define a new spider class that inherits from CrawlSpider
class MysinaSpider(CrawlSpider):
    name = 'mysina'
    allowed_domains = ['sina.com.cn']
    start_urls = ['http://roll.news.sina.com.cn/news/gnxw/gdxw1/index_2.shtml']
    # Crawling rules:
    # allow=(regex): links to follow; deny=(regex): links to skip
    # callback: the method that parses each matched page, here getparse()
    # follow=True: keep extracting links from the pages these rules match
    rules = [Rule(LinkExtractor(allow=(r"index_(\d+)\.shtml",)), callback="getparse", follow=True)]

    # Never name the callback parse(): CrawlSpider already defines parse() to drive
    # the rules, so overriding it would break the crawl. getparse (or any other name) is fine.
    def getparse(self, response):
        print(response.url)
        newsList = response.xpath('//ul[@class="list_009"]/li')
        for news in newsList:
            item = items.SinaItem()
            # .extract() is required to pull the actual text out of the selector
            newsTitle = news.xpath('./a/text()')[0].extract()
            # link URL
            newsUrl = news.xpath('./a/@href')[0].extract()
            # publication time
            newsTime = news.xpath('./span/text()')[0].extract()
            # article body
            content = self.getContent(newsUrl)
            item['newsTitle'] = newsTitle  # news title
            item['newsUrl'] = newsUrl      # link URL
            item['newsTime'] = newsTime    # publication time
            item['content'] = content      # article body
            yield item
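
    # Note: getContent below fetches each article with a blocking requests.get
    # call, outside Scrapy's scheduler; a non-blocking alternative is sketched
    # after the listing.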
    # Fetch an article page directly and return all text under div#article.
    def getContent(self, url):
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36"}
        html = requests.get(url, headers=headers).content.decode('utf-8', 'ignore')
        mytree = lxml.etree.HTML(html)
        contentList = mytree.xpath('//div[@id="article"]//text()')
        content = ""
        for c in contentList:
            content += c.strip()
        return content
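
The spider assumes a SinaItem class in the project's items.py. A minimal sketch, assuming only the four fields that getparse() fills in (scrapy.Field is the standard Scrapy field declaration):

# items.py (sketch)
import scrapy

class SinaItem(scrapy.Item):
    newsTitle = scrapy.Field()  # news title
    newsUrl = scrapy.Field()    # link URL
    newsTime = scrapy.Field()   # publication time
    content = scrapy.Field()    # article body

Run the spider from the project root with: scrapy crawl mysina

On the design: the blocking requests.get in getContent stalls Scrapy's event loop once per article. A common alternative is to let Scrapy fetch the article page as well, carrying the partly-filled item along in the request's meta dict. A sketch, where parse_article is a name introduced here for illustration:

# A method to add to MysinaSpider (requires "import scrapy" at the top of the file).
# In getparse(), replace
#     content = self.getContent(newsUrl)
# and the final "yield item" with
#     yield scrapy.Request(newsUrl, callback=self.parse_article, meta={'item': item})
# so that parse_article yields the completed item instead.
def parse_article(self, response):
    item = response.meta['item']
    # join all text nodes under the article container
    item['content'] = ''.join(t.strip() for t in response.xpath('//div[@id="article"]//text()').extract())
    yield item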