准备工作:
安装有miniconda3,scrapyd,redis和mongodb的Ubuntu虚拟机,主机安装有gerapy==0.9.12版本
第一部分:
代码
首先创建一个scrapy项目。
spider里的代码如下:
from scrapy.http import HtmlResponse, Request
from scrapy_redis.spiders import RedisSpider
class JobSpider(RedisSpider):
    """Distributed job-listing spider driven by a shared Redis start-URL queue.

    Multiple worker processes can run this spider; scrapy-redis pops URLs
    from ``redis_key`` so each page is crawled exactly once.
    """

    name = "job"
    # Redis list that the scrapy-redis scheduler reads start URLs from.
    redis_key = 'job:start_urls'

    def start_requests(self):
        """Seed the Redis queue with the first API page, then defer to RedisSpider.

        ``self.server`` is the Redis connection set up by scrapy-redis;
        pushing here lets the parent class pick the URL up through its
        normal queue-consuming loop.
        """
        first_page = (
            'https://careers.tencent.com/tencentcareer/api/post/'
            'Query?pageIndex=1&pageSize=10'
        )
        self.server.lpush(self.redis_key, first_page)
        # Hand back the parent's generator so scheduling stays with scrapy-redis.
        yield from super().start_requests()
def parse(self, response: HtmlResponse, **kwargs):
print("Parsing:", response.url)
job_list = response.json()['Data']['Posts']
for job in job_list:
item = dict()
item['RecruitPostName'] = job['RecruitPostName']
item['Responsibility'] = job['Responsibility']
item['RequireWorkYearsName'] = job['RequireWorkYearsName']
yield item
yield from self.next_page(