Multiple yield scrapy.Request calls in a Scrapy spider
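A single Scrapy callback can yield any number of scrapy.Request objects (alongside items); the scheduler queues each one independently and routes its response to whatever callback it names. As a minimal sketch of the pattern (placeholder spider name, URLs, and XPaths, not taken from the real project), a listing-page callback can yield one request per detail link and, in the same loop's method, another request for the next page:

import scrapy

class MultiYieldSpider(scrapy.Spider):
    name = 'multi_yield_demo'
    start_urls = ['https://example.com/list']

    def parse(self, response):
        # Yield one Request per detail link; each response goes to parse_detail.
        for href in response.xpath('//a[@class="item"]/@href').getall():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_detail)
        # In the same callback, also yield a request for the next listing page.
        next_page = response.xpath('//a[@rel="next"]/@href').get()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

    def parse_detail(self, response):
        yield {'url': response.url, 'title': response.xpath('//h1/text()').get()}

The full spider below applies the same idea to kathmandupost.com: article-detail requests and AJAX pagination requests are yielded from the same callbacks.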
import scrapy
import re
import requests
import json
import time
from ..items import InduspiderItem
from newspaper import Article
from gne import GeneralNewsExtractor
from date_extractor import extract_date
from lxml import etree
from urllib import parse
class KathmanduSpider(scrapy.Spider):
    name = 'Kathmandu'
    allowed_domains = ['kathmandupost.com']
    start_urls = ['https://kathmandupost.com/politics']
    def parse(self, response):
        # First set of yields: one Request per article link on the listing page.
        list_ = response.xpath('//div/article/a/@href')
        for u in list_:
            item = InduspiderItem()
            c = u.get()
            url = 'https://kathmandupost.com' + c
            item['_id'] = url
            item['yuming'] = 'www.kathmandupost.com'
            # Pass the partially filled item to the detail-page callback via meta.
            yield scrapy.Request(url=url, meta={'meta_1': item}, callback=self.get_two_page)
        # Second yield: the site loads older articles via AJAX, keyed by the value
        # stored in "var last = '...'" inside an inline <script>.
        tex = response.body.decode(response.encoding)
        regex = "<script>var last = '(.*?)';var requestRunning"
        par = re.compile(regex, re.S)
        r_ = par.findall(tex)
        if r_:
            param = parse.urlencode({'pub': r_[0]})
            one_url = 'https://kathmandupost.com/politics?html=1&{}'.format(param)
            yield scrapy.Request(url=one_url,
                                 headers={'referer': 'https://kathmandupost.com/politics',
                                          'x-requested-with': 'XMLHttpRequest'},
                                 callback=self.get_url)

    def get_url(self, response):
        # The AJAX endpoint returns JSON containing a block of listing HTML
        # plus the cursor ('date') for the next batch.
        tex = response.body.decode(response.encoding)
        ht = json.loads(tex)
        pars = etree.HTML(ht['news_list_html'])
        lis = pars.xpath('//div/article/a/@href')
        for u in lis:
            item = InduspiderItem()
            url = 'https://kathmandupost.com' + u
            item['_id'] = url
            item['yuming'] = 'www.kathmandupost.com'
            yield scrapy.Request(url=url, meta={'meta_1': item}, callback=self.get_two_page)
        # Keep paginating: request the next batch and route it back to this callback.
        tm = ht['date']
        par = parse.urlencode({'pub': tm})
        ur = 'https://kathmandupost.com/politics?html=1&{}'.format(par)
        yield scrapy.Request(url=ur,
                             headers={'referer': 'https://kathmandupost.com/politics',
                                      'x-requested-with': 'XMLHttpRequest'},
                             callback=self.get_url)

    def get_two_page(self, response):
        tex = response.body.decode(response.encoding)
        # GNE extracts title/author/time/body from the article HTML; the XPaths
        # override its defaults for this site's layout.
        extr = GeneralNewsExtractor()
        resul = extr.extract(tex, author_xpath='//div/h5/a/text()',
                             publish_time_xpath='//div/div[@class="updated-time"][2]/text()')
        # newspaper is only used for the top image and embedded videos; feed it
        # the HTML already downloaded instead of fetching the page again.
        news = Article('')
        news.download(input_html=tex)
        news.parse()
        item = response.meta['meta_1']
        item['title'] = resul['title']
        item['updatetime'] = resul['publish_time']
        item['author'] = resul['author']
        item['content'] = resul['content']
        item['img'] = news.top_image
        item['timestamp'] = time.time()
        item['video'] = news.movies
        item['format_time'] = extract_date(resul['publish_time'])
        yield item
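For reference, the spider assumes an InduspiderItem exposing the fields populated above. A minimal sketch of what that items.py definition could look like (field names taken from the spider; the actual project file may differ):

import scrapy

class InduspiderItem(scrapy.Item):
    _id = scrapy.Field()          # article URL, used as the unique key
    yuming = scrapy.Field()       # source domain
    title = scrapy.Field()
    updatetime = scrapy.Field()   # publish-time string extracted by GNE
    format_time = scrapy.Field()  # datetime parsed by date_extractor
    author = scrapy.Field()
    content = scrapy.Field()
    img = scrapy.Field()          # top image found by newspaper
    video = scrapy.Field()        # embedded videos found by newspaper
    timestamp = scrapy.Field()    # crawl time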