2. Crawl job postings from the Tencent careers site and save them as JSON
Tencent careers site
The JSON data returned by the Tencent recruitment API
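Before building the Scrapy project, it can help to preview the API outside the framework. A minimal sketch using requests, assuming the Query endpoint still returns the structure captured here (the Data/Posts keys match what the spider below relies on):

import requests

# preview one page of the recruitment API (URL as captured at the time of writing)
url = ('https://careers.tencent.com/tencentcareer/api/post/Query'
      '?timestamp=1626224340036&countryId=&cityId=&bgIds=&productId='
      '&categoryId=&parentCategoryId=&attrId=&keyword='
      '&pageIndex=1&pageSize=10&language=zh-cn&area=cn')
data = requests.get(url).json()
for post in data['Data']['Posts']:
    print(post['RecruitPostName'], '-', post['LocationName'])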
First, decide which fields to scrape.
items.py:
import scrapy

class TencentItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    RecruitPostName = scrapy.Field()  # job title
    LocationName = scrapy.Field()     # work location
    Responsibility = scrapy.Field()   # job responsibilities
    LastUpdateTime = scrapy.Field()   # last update time
    PostURL = scrapy.Field()          # link to the posting
    SourceID = scrapy.Field()
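These Field declarations are the whole model: a scrapy.Item behaves like a dict restricted to the declared keys, which is why the pipeline later can call dict(item). A quick illustration (the sample value is made up):

from Tencent.items import TencentItem

item = TencentItem()
item['RecruitPostName'] = '后台开发工程师'  # declared field: OK
print(dict(item))                           # items convert cleanly to plain dicts
# item['salary'] = '20k'                    # would raise KeyError: field not declared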
Next, write the spider:
import scrapy
import json

from Tencent.items import TencentItem

# sample API requests captured from the careers page (pageIndex controls paging):
# https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1626224165766&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=&pageIndex=1&pageSize=10&language=zh-cn&area=cn
# https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1626224340036&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=&pageIndex=2&pageSize=10&language=zh-cn&area=cn

class TencentSpider(scrapy.Spider):
    name = 'tencent'
    # allowed_domains = ['careers.tencent.com']
    url = 'https://careers.tencent.com/tencentcareer/api/post/Query?timestamp=1626224340036&countryId=&cityId=&bgIds=&productId=&categoryId=&parentCategoryId=&attrId=&keyword=&pageIndex={}&pageSize=10&language=zh-cn&area=cn'
    page_num = 1
    start_urls = [url.format(page_num)]

    def parse(self, response):
        # the endpoint returns JSON, so parse it with json.loads
        # (safer than eval(), which needs global true/false/null shims)
        positionList = json.loads(response.text)['Data']['Posts']
        for position in positionList:
            item = TencentItem()
            item['RecruitPostName'] = position['RecruitPostName']
            item['LocationName'] = position['LocationName']
            item['Responsibility'] = position['Responsibility']
            item['LastUpdateTime'] = position['LastUpdateTime']
            item['PostURL'] = position['PostURL']
            item['SourceID'] = position['SourceID']
            yield item
        # request the next page until page 10
        if self.page_num <= 10:
            self.page_num += 1
            yield scrapy.Request(self.url.format(self.page_num), callback=self.parse)
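One detail worth noting: the timestamp in the hard-coded URL is a stale capture, and the API appeared to accept it anyway. Since the parameter is just epoch milliseconds, a fresh value is easy to generate if the stale one ever stops working (a sketch, not something the original code needs):

import time

# the timestamp query parameter is milliseconds since the epoch;
# a fresh value can replace the captured 1626224340036 if needed
timestamp = int(time.time() * 1000)
print(timestamp)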
Finally, the pipeline file:
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import json

class TencentPipeline:
    fp = None

    def open_spider(self, spider):
        self.fp = open('./tencent.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # one JSON object per line (JSON Lines); the original trailing
        # comma would have produced a file that is not valid JSON
        content = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.fp.write(content)
        return item

    def close_spider(self, spider):  # was misspelled close_spider4p, so the file was never closed
        self.fp.close()
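The template comment above is a real requirement: nothing reaches tencent.json unless the pipeline is registered in settings.py. A typical entry (the priority 300 is the conventional default; disabling robots.txt obedience is an assumption often needed for API routes, not something the original post shows):

# settings.py (excerpt)
ITEM_PIPELINES = {
    'Tencent.pipelines.TencentPipeline': 300,  # lower number = runs earlier
}
ROBOTSTXT_OBEY = False  # assumption: the JSON API route may be disallowed by robots.txt
# start the crawl with:  scrapy crawl tencent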
3. Crawl complaints and their detail pages from the Dongguan Sunshine Government platform
Sunshine Government platform
Decide on the fields we need:
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy

class SunproItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    href = scrapy.Field()      # link to the complaint's detail page
    start = scrapy.Field()     # processing status of the complaint
    time = scrapy.Field()      # time the complaint was posted
    question = scrapy.Field()  # full complaint text from the detail page
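The XPath expressions used in the spider below can be sanity-checked interactively first with scrapy shell (an optional step; the session assumes the page structure at the time of writing):

scrapy shell "https://wz.sun0769.com/political/index/search?keyword=投诉&page=1"
>>> li_list = response.xpath('/html/body/div[2]/div[3]/ul/li')   # one <li> per complaint
>>> li_list[0].xpath('./span[3]/a/@href').extract_first()        # relative detail-page link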
The spider:
import scrapy

from sunPro.items import SunproItem

class Sun0769Spider(scrapy.Spider):
    name = 'sun0769'
    # allowed_domains = ['www.sun0769.com']
    page = 1
    url = "https://wz.sun0769.com/political/index/search?keyword=%E6%8A%95%E8%AF%89&page="
    start_urls = ['https://wz.sun0769.com/political/index/search?keyword=%E6%8A%95%E8%AF%89&page=1']

    def parse(self, response):
        title_list = response.xpath('/html/body/div[2]/div[3]/ul/li')
        for title in title_list:
            item = SunproItem()
            item['href'] = title.xpath('./span[3]/a/@href').extract_first()
            item['start'] = title.xpath('./span[2]/text()').extract_first()
            # the post time is hidden inside an HTML comment node
            item['time'] = title.xpath('./comment()/text()').extract_first()
            print(item['href'], type(item['href']))
            new_url = 'https://wz.sun0769.com' + item['href']
            # pass the partially filled item to the detail-page callback via meta
            yield scrapy.Request(
                new_url,
                callback=self.parse_detail,
                meta={'item': item}
            )
        # paging: request the next list page until page 100
        if self.page < 100:
            self.page += 1
            next_url = self.url + str(self.page)  # page is an int, so convert before concatenating
            yield scrapy.Request(
                url=next_url,
                callback=self.parse
            )

    def parse_detail(self, response):
        item = response.meta['item']
        item['question'] = response.xpath('/html/body/div[3]/div[2]/div[2]/div[2]/pre/text()').extract_first()
        yield item
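A small robustness note: the spider builds the detail URL by string concatenation, which works because @href is site-relative here. Scrapy's response.urljoin() does the same job and also tolerates absolute links:

# drop-in replacement for 'https://wz.sun0769.com' + item['href']
new_url = response.urljoin(item['href'])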
The pipeline:
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import re

class SunproPipeline:
    def process_content(self, content):
        # extract_first() returns a single string (or None), so clean it
        # directly; iterating over it would only produce single characters
        if content is None:
            return ''
        return re.sub(r"\xa0|\s", "", content)

    def process_item(self, item, spider):
        item['question'] = self.process_content(item['question'])
        print(item)
        return item
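As with the Tencent exercise, SunproPipeline only runs after it is registered in settings.py (module path assumed from the sunPro imports above):

# settings.py (excerpt)
ITEM_PIPELINES = {
    'sunPro.pipelines.SunproPipeline': 300,
}
# then start the crawl with:  scrapy crawl sun0769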