I copied this over from my teacher, hehe.
1. Create the project and the spider (see the commands below).
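Presumably (judging from the project name Tencet in settings.py and the spider name tencent below; the original leaves this step blank, so the exact commands are an assumption) this was:

scrapy startproject Tencet
cd Tencet
scrapy genspider tencent careers.tencent.com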
2. settings.py — the settings that need changing
BOT_NAME = 'Tencet'
SPIDER_MODULES = ['Tencet.spiders']
NEWSPIDER_MODULE = 'Tencet.spiders'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.35 Safari/537.36'
# Whether to obey the robots.txt protocol; defaults to True (obey). Usually changed to False.
ROBOTSTXT_OBEY = False
# Maximum number of concurrent requests; the default is 16. Without a proxy
# pool a high value makes it easy to get banned, so set it to 1.
CONCURRENT_REQUESTS = 1  # download one request at a time
DOWNLOAD_DELAY = 2  # wait 2 seconds between downloads
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}
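As a side note, instead of a fixed DOWNLOAD_DELAY you can let Scrapy's built-in AutoThrottle extension adapt the delay to the server's response times. A minimal sketch of the relevant settings (the values here are illustrative, not part of the original tutorial):

# Optional: let AutoThrottle adjust the delay dynamically
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 2            # initial download delay, in seconds
AUTOTHROTTLE_MAX_DELAY = 10             # upper bound when the server is slow
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0   # average concurrent requests per server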
3. tencent.py
import json
import math
import urllib.parse  # plain `import urllib` does not reliably expose urllib.parse
import scrapy
class TencentSpider(scrapy.Spider):
    """
    Request the first-level (search) pages to collect each job's postId, then
    use the postId to build the second-level (detail) URL, which is where the
    job information lives.
    First we need the total page count; once that is known, so is every page
    URL. The page count is fixed as soon as we know which keyword is searched.
    """
    name = 'tencent'
    allowed_domains = ['careers.tencent.com']
    job = input('Enter the job title to search for: ')
    # URL-encode the keyword so it is safe to embed in the query string
    encode_job = urllib.parse.quote(job)
    # start_urls = ['http://careers.tencent.com/']
    # First-level URL: keyword is the job searched, pageIndex the current page index
    first_url = "https://careers.tencent.com/tencentcareer/api/post/Query?" \
                "timestamp=1631863485839&countryId=&cityId=&bgIds=&productId=&" \
                "categoryId=&parentCategoryId=&attrId=&keyword={}&pageIndex={}&" \
                "pageSize=10&language=zh-cn&area=cn"
    # Comparing detail URLs shows they differ only in timestamp and postId
    second_urls = "https://careers.tencent.com/tencentcareer/api/post/" \
                  "ByPostId?timestamp=1631850857726&postId={}&language=zh-cn"
    start_urls = [first_url.format(encode_job, 1)]
    def parse(self, response):
        # The response body is a JSON string; convert it to a dict
        json_dict = json.loads(response.text)
        print(type(json_dict))
        print(json_dict)
        # total number of matching jobs
        job_counts = json_dict["Data"]["Count"]
        print("Total jobs: {}".format(job_counts))
        # total number of pages (10 jobs per page, matching pageSize=10 in the URL)
        total_pages = math.ceil(job_counts / 10)
        print("Total pages: {}".format(total_pages))
        # build the first-level URL for every page
        for page in range(1, total_pages + 1):
            first_url = self.first_url.format(self.encode_job, page)
            # Hand the URL to the scheduler to fetch the postIds; callback names
            # the method that will parse the response. dont_filter=True is needed
            # because page 1 was already fetched as the start URL and would
            # otherwise be dropped by Scrapy's duplicate filter.
            yield scrapy.Request(url=first_url, callback=self.parse_post_ids,
                                 dont_filter=True)
    def parse_post_ids(self, response):
        # "Posts" is a list of dicts; each dict contains a PostId
        posts = json.loads(response.text)["Data"]["Posts"]
        # take each job's PostId
        for post in posts:
            post_id = post["PostId"]
            # build the second-level (detail) URL
            second_url = self.second_urls.format(post_id)
            # hand the URL to the scheduler
            yield scrapy.Request(url=second_url, callback=self.parse_job)
    def parse_job(self, response):
        # second-level page: parse the job details
        job = json.loads(response.text)["Data"]
        name = job['RecruitPostName']
        location = job['LocationName']
        category = job['CategoryName']
        responsibility = job['Responsibility']
        requirement = job['Requirement']
        time = job['LastUpdateTime']
        print(name)
        print(location)
        print(category)
        print(responsibility)
        print(requirement)
        print(time)
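For reference, this is roughly the JSON the two endpoints return, inferred only from the keys the spider reads above (the values shown are placeholders, not real data):

# Query endpoint (first_url): one page of search results
{"Data": {"Count": 345,                          # total number of matching jobs
          "Posts": [{"PostId": "12345...", ...},  # one entry per job on the page
                    ...]}}

# ByPostId endpoint (second_urls): details for a single job
{"Data": {"RecruitPostName": "...", "LocationName": "...",
          "CategoryName": "...", "Responsibility": "...",
          "Requirement": "...", "LastUpdateTime": "..."}}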
4. run.py
from scrapy import cmdline
cmdline.execute("scrapy crawl tencent".split())
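A note on running: you can also just run scrapy crawl tencent from the project directory. Be aware that because job = input(...) sits at class-definition level, the prompt fires whenever the spider module is imported, including for commands like scrapy list. A more idiomatic variant (a hypothetical sketch, not part of the original tutorial; the query string is trimmed for brevity, so in practice reuse the full one from tencent.py) passes the keyword as a spider argument via -a:

import urllib.parse

import scrapy

class TencentSpider(scrapy.Spider):
    name = 'tencent'
    allowed_domains = ['careers.tencent.com']
    # trimmed first-level URL; see tencent.py above for the full query string
    first_url = ("https://careers.tencent.com/tencentcareer/api/post/Query?"
                 "keyword={}&pageIndex={}&pageSize=10&language=zh-cn&area=cn")

    def __init__(self, job='python', *args, **kwargs):
        # `job` comes from the command line: scrapy crawl tencent -a job=python
        super().__init__(*args, **kwargs)
        self.encode_job = urllib.parse.quote(job)

    def start_requests(self):
        yield scrapy.Request(self.first_url.format(self.encode_job, 1),
                             callback=self.parse)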