Scrapy: bouncing back and forth between multiple yields

This post is about Scrapy, specifically about using multiple yield scrapy.Request calls in one spider; Python and XPath are also involved.


Multiple yield scrapy.Request calls in Scrapy
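
Before the full spider, the core idea in isolation: a Scrapy callback is a generator, so it can freely mix yield item and yield scrapy.Request(...) in the same method body. The engine routes each yielded item to the item pipelines and sends each yielded request back to the scheduler. A minimal sketch of the pattern (the spider name, URLs and XPaths here are placeholders, not from the real project):

import scrapy


class MixedYieldSpider(scrapy.Spider):
    # Hypothetical demo spider: one callback yields both follow-up
    # requests and (via parse_detail) scraped items.
    name = 'mixed_yield_demo'
    start_urls = ['https://example.com/list']

    def parse(self, response):
        # Yield one detail-page request per link on the listing page...
        for href in response.xpath('//a[@class="detail"]/@href').getall():
            yield scrapy.Request(response.urljoin(href), callback=self.parse_detail)

        # ...and, from the same callback, also follow the pagination link.
        next_page = response.xpath('//a[@rel="next"]/@href').get()
        if next_page:
            yield scrapy.Request(response.urljoin(next_page), callback=self.parse)

    def parse_detail(self, response):
        # A plain dict works as an item and goes straight to the pipelines.
        yield {'url': response.url,
               'title': response.xpath('//h1/text()').get()}

The spider below applies the same pattern to kathmandupost.com, with one extra twist: pagination goes through an AJAX endpoint, so one callback keeps yielding requests back to itself.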

import scrapy
import re
import json
import time
from ..items import InduspiderItem
from newspaper import Article
from gne import GeneralNewsExtractor
from date_extractor import extract_date
from lxml import etree
from urllib import parse


class KathmanduSpider(scrapy.Spider):
    name = 'Kathmandu'
    allowed_domains = ['kathmandupost.com']
    start_urls = ['https://kathmandupost.com/politics']

    def parse(self, response):
        # First kind of yield: one detail-page request per article link
        # on the listing page, with the half-filled item riding along in meta.
        for u in response.xpath('//div/article/a/@href'):
            item = InduspiderItem()
            url = 'https://kathmandupost.com' + u.get()
            item['_id'] = url
            item['yuming'] = 'www.kathmandupost.com'
            yield scrapy.Request(url=url, meta={'meta_1': item}, callback=self.get_two_page)

        # Second kind of yield: the page embeds the timestamp of the last
        # listed article in an inline script; it is the cursor for the site's
        # AJAX "load more" endpoint.
        regex = "<script>var last = '(.*?)';var requestRunning"
        r_ = re.findall(regex, response.text, re.S)
        param = parse.urlencode({'pub': r_[0]})
        one_url = 'https://kathmandupost.com/politics?html=1&{}'.format(param)
        yield scrapy.Request(url=one_url, headers={'referer': 'https://kathmandupost.com/politics',
                                                   'x-requested-with': 'XMLHttpRequest'}, callback=self.get_url)

    def get_url(self, response):
        # The AJAX endpoint returns JSON whose 'news_list_html' field holds an
        # HTML fragment with the next batch of article links.
        ht = json.loads(response.text)
        pars = etree.HTML(ht['news_list_html'])
        for u in pars.xpath('//div/article/a/@href'):
            item = InduspiderItem()
            url = 'https://kathmandupost.com' + u
            item['_id'] = url
            item['yuming'] = 'www.kathmandupost.com'
            yield scrapy.Request(url=url, meta={'meta_1': item}, callback=self.get_two_page)

        # The JSON also carries the next cursor, so this callback yields a new
        # request with itself as the callback, paging until the site stops
        # handing back fresh 'date' values (Scrapy's duplicate filter drops
        # the request once the same URL comes around again).
        par = parse.urlencode({'pub': ht['date']})
        ur = 'https://kathmandupost.com/politics?html=1&{}'.format(par)
        yield scrapy.Request(url=ur, headers={'referer': 'https://kathmandupost.com/politics',
                                              'x-requested-with': 'XMLHttpRequest'}, callback=self.get_url)

    def get_two_page(self, response):
        tex = response.text

        # GNE extracts title/author/body/time from the article HTML; the two
        # XPath hints point it at this site's byline and timestamp nodes.
        extr = GeneralNewsExtractor()
        resul = extr.extract(tex, author_xpath='//div/h5/a/text()',
                             publish_time_xpath='//div/div[@class="updated-time"][2]/text()')

        # newspaper is only used for the lead image and videos; feed it the
        # already-downloaded HTML instead of letting it fetch the URL again.
        news = Article('')
        news.download(input_html=tex)
        news.parse()

        # Retrieve the item started in the listing callback and finish it.
        item = response.meta['meta_1']
        item['title'] = resul['title']
        item['updatetime'] = resul['publish_time']
        item['author'] = resul['author']
        item['content'] = resul['content']
        item['img'] = news.top_image
        item['timestamp'] = time.time()
        item['video'] = news.movies
        item['format_time'] = extract_date(resul['publish_time'])

        yield item
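
One more note on the hand-off between callbacks: the item travels from parse to get_two_page through meta={'meta_1': item} and is read back with response.meta['meta_1']. That works, but since Scrapy 1.7 the documented way to pass values to a callback is cb_kwargs, which delivers them as real keyword arguments and keeps meta free for middleware flags. A sketch of the same hand-off rewritten with cb_kwargs (same names as the spider above):

    def parse(self, response):
        for u in response.xpath('//div/article/a/@href'):
            item = InduspiderItem()
            item['_id'] = 'https://kathmandupost.com' + u.get()
            item['yuming'] = 'www.kathmandupost.com'
            # cb_kwargs entries arrive as named parameters of the callback.
            yield scrapy.Request(url=item['_id'],
                                 cb_kwargs={'item': item},
                                 callback=self.get_two_page)

    def get_two_page(self, response, item):
        # item comes in as a keyword argument; no response.meta lookup needed.
        ...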
