2. Crawl job postings from the Tencent careers site and save them as JSON
Tencent careers site
The JSON data returned by the Tencent recruitment API
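Before building the Scrapy project, it can help to preview the API outside the framework. A minimal sketch using requests, assuming the Query endpoint still returns the structure captured here (the Data/Posts keys match what the spider below relies on):

import requests

# preview one page of the recruitment API (URL as captured at the time of writing)
url = ('https://careers.tencent.com/tencentcareer/api/post/Query'
      '?timestamp=1626224340036&countryId=&cityId=&bgIds=&productId='
      '&categoryId=&parentCategoryId=&attrId=&keyword='
      '&pageIndex=1&pageSize=10&language=zh-cn&area=cn')
data = requests.get(url).json()
for post in data['Data']['Posts']:
    print(post['RecruitPostName'], '-', post['LocationName'])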
First, decide which fields to scrape.
items.py:
import scrapy

class TencentItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    RecruitPostName = scrapy.Field()  # job title
    LocationName = scrapy.Field()     # work location
    Responsibility = scrapy.Field()   # job responsibilities
    LastUpdateTime = scrapy.Field()   # last update time
    PostURL = scrapy.Field()          # link to the posting
    SourceID = scrapy.Field()
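These Field declarations are the whole model: a scrapy.Item behaves like a dict restricted to the declared keys, which is why the pipeline later can call dict(item). A quick illustration (the sample value is made up):

from Tencent.items import TencentItem

item = TencentItem()
item['RecruitPostName'] = '后台开发工程师'  # declared field: OK
print(dict(item))                           # items convert cleanly to plain dicts
# item['salary'] = '20k'                    # would raise KeyError: field not declared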
Next, write the spider:
import scrapy
import json

from Tencent.items import TencentItem

# sample API requests captured from the careers page (pageIndex controls paging):
# https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1626224165766&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=&pageIndex=1&pageSize=10&language=zh-cn&area=cn
# https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1626224340036&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=&pageIndex=2&pageSize=10&language=zh-cn&area=cn

class TencentSpider(scrapy.Spider):
    name = 'tencent'
    # allowed_domains = ['careers.tencent.com']
    url = 'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1626224340036&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=&pageIndex={}&pageSize=10&language=zh-cn&area=cn'
    page_num = 1
    start_urls = [url.format(page_num)]

    def parse(self, response):
        # the endpoint returns JSON, so parse it with json.loads
        # (safer than eval(), which needs global true/false/null shims)
        positionList = json.loads(response.text)['Data']['Posts']
        for position in positionList:
            item = TencentItem()
            item['RecruitPostName'] = position['RecruitPostName']
            item['LocationName'] = position['LocationName']
            item['Responsibility'] = position['Responsibility']
            item['LastUpdateTime'] = position['LastUpdateTime']
            item['PostURL'] = position['PostURL']
            item['SourceID'] = position['SourceID']
            yield item
        # request the next page until page 10
        if self.page_num <= 10:
            self.page_num += 1
            yield scrapy.Request(self.url.format(self.page_num), callback=self.parse)
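One detail worth noting: the timestamp in the hard-coded URL is a stale capture, and the API appeared to accept it anyway. Since the parameter is just epoch milliseconds, a fresh value is easy to generate if the stale one ever stops working (a sketch, not something the original code needs):

import time

# the timestamp query parameter is milliseconds since the epoch;
# a fresh value can replace the captured 1626224340036 if needed
timestamp = int(time.time() * 1000)
print(timestamp)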
Finally, the pipeline file:
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import json

class TencentPipeline:
    fp = None

    def open_spider(self, spider):
        self.fp = open('./tencent.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # one JSON object per line (JSON Lines); the original trailing
        # comma would have produced a file that is not valid JSON
        content = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.fp.write(content)
        return item

    def close_spider(self, spider):  # was misspelled close_spider4p, so the file was never closed
        self.fp.close()
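The template comment above is a real requirement: nothing reaches tencent.json unless the pipeline is registered in settings.py. A typical entry (the priority 300 is the conventional default; disabling robots.txt obedience is an assumption often needed for API routes, not something the original post shows):

# settings.py (excerpt)
ITEM_PIPELINES = {
    'Tencent.pipelines.TencentPipeline': 300,  # lower number = runs earlier
}
ROBOTSTXT_OBEY = False  # assumption: the JSON API route may be disallowed by robots.txt
# start the crawl with:  scrapy crawl tencent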
3. Crawl complaints and their detail pages from the Dongguan Sunshine Government platform
Sunshine Government platform
Decide on the fields we need:
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy

class SunproItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    href = scrapy.Field()      # link to the complaint's detail page
    start = scrapy.Field()     # processing status of the complaint
    time = scrapy.Field()      # time the complaint was posted
    question = scrapy.Field()  # full complaint text from the detail page
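The XPath expressions used in the spider below can be sanity-checked interactively first with scrapy shell (an optional step; the session assumes the page structure at the time of writing):

scrapy shell "https://wz.sun0769.com/political/index/search?keyword=投诉&page=1"
>>> li_list = response.xpath('/html/body/div[2]/div[3]/ul/li')   # one <li> per complaint
>>> li_list[0].xpath('./span[3]/a/@href').extract_first()        # relative detail-page link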
The spider:
import scrapy

from sunPro.items import SunproItem

class Sun0769Spider(scrapy.Spider):
    name = 'sun0769'
    # allowed_domains = ['www.sun0769.com']
    page = 1
    url = "https://wz.sun0769.com/political/index/search?keyword=%E6%8A%95%E8%AF%89&page="
    start_urls = ['https://wz.sun0769.com/political/index/search?keyword=%E6%8A%95%E8%AF%89&page=1']

    def parse(self, response):
        title_list = response.xpath('/html/body/div[2]/div[3]/ul/li')
        for title in title_list:
            item = SunproItem()
            item['href'] = title.xpath('./span[3]/a/@href').extract_first()
            item['start'] = title.xpath('./span[2]/text()').extract_first()
            # the post time is hidden inside an HTML comment node
            item['time'] = title.xpath('./comment()/text()').extract_first()
            print(item['href'], type(item['href']))
            new_url = 'https://wz.sun0769.com' + item['href']
            # pass the partially filled item to the detail-page callback via meta
            yield scrapy.Request(
                new_url,
                callback=self.parse_detail,
                meta={'item': item}
            )
        # paging: request the next list page until page 100
        if self.page < 100:
            self.page += 1
            next_url = self.url + str(self.page)  # page is an int, so convert before concatenating
            yield scrapy.Request(
                url=next_url,
                callback=self.parse
            )

    def parse_detail(self, response):
        item = response.meta['item']
        item['question'] = response.xpath('/html/body/div[3]/div[2]/div[2]/div[2]/pre/text()').extract_first()
        yield item
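A small robustness note: the spider builds the detail URL by string concatenation, which works because @href is site-relative here. Scrapy's response.urljoin() does the same job and also tolerates absolute links:

# drop-in replacement for 'https://wz.sun0769.com' + item['href']
new_url = response.urljoin(item['href'])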
The pipeline:
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import re

class SunproPipeline:
    def process_content(self, content):
        # extract_first() returns a single string (or None), so clean it
        # directly; iterating over it would only produce single characters
        if content is None:
            return ''
        return re.sub(r"\xa0|\s", "", content)

    def process_item(self, item, spider):
        item['question'] = self.process_content(item['question'])
        print(item)
        return item
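As with the Tencent exercise, SunproPipeline only runs after it is registered in settings.py (module path assumed from the sunPro imports above):

# settings.py (excerpt)
ITEM_PIPELINES = {
    'sunPro.pipelines.SunproPipeline': 300,
}
# then start the crawl with:  scrapy crawl sun0769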