I copied this over from my teacher, hehe.
1. Create the project and the spider (see the commands below).
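Presumably (judging from the project name Tencet in settings.py and the spider name tencent below; the original leaves this step blank, so the exact commands are an assumption) this was:

scrapy startproject Tencet
cd Tencet
scrapy genspider tencent careers.tencent.com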
2. settings.py — the settings that need changing
BOT_NAME = 'Tencet'
SPIDER_MODULES = ['Tencet.spiders']
NEWSPIDER_MODULE = 'Tencet.spiders'
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.35 Safari/537.36'
# Whether to obey the robots.txt protocol; defaults to True (obey). Usually changed to False.
ROBOTSTXT_OBEY = False
# Maximum number of concurrent requests; the default is 16. Without a proxy
# pool a high value makes it easy to get banned, so set it to 1.
CONCURRENT_REQUESTS = 1  # download one request at a time
DOWNLOAD_DELAY = 2  # wait 2 seconds between downloads
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
}
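As a side note, instead of a fixed DOWNLOAD_DELAY you can let Scrapy's built-in AutoThrottle extension adapt the delay to the server's response times. A minimal sketch of the relevant settings (the values here are illustrative, not part of the original tutorial):

# Optional: let AutoThrottle adjust the delay dynamically
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 2            # initial download delay, in seconds
AUTOTHROTTLE_MAX_DELAY = 10             # upper bound when the server is slow
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0   # average concurrent requests per server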
3. tencent.py
import json
import math
import urllib.parse  # plain `import urllib` does not reliably expose urllib.parse
import scrapy
class TencentSpider(scrapy.Spider):
    """
    Request the first-level (search) pages to collect each job's postId, then
    use the postId to build the second-level (detail) URL, which is where the
    job information lives.
    First we need the total page count; once that is known, so is every page
    URL. The page count is fixed as soon as we know which keyword is searched.
    """
    name = 'tencent'
    allowed_domains = ['careers.tencent.com']
    job = input('Enter the job title to search for: ')
    # URL-encode the keyword so it is safe to embed in the query string
    encode_job = urllib.parse.quote(job)
    # start_urls = ['http://careers.tencent.com/']
    # First-level URL: keyword is the job searched, pageIndex the current page index
    first_url = "https://careers.tencent.com/tencentcareer/api/post/Query?" \
                "timestamp=1631863485839&countryId=&cityId=&bgIds=&productId=&" \
                "categoryId=&parentCategoryId=&attrId=&keyword={}&pageIndex={}&" \
                "pageSize=10&language=zh-cn&area=cn"
    # Comparing detail URLs shows they differ only in timestamp and postId
    second_urls = "https://careers.tencent.com/tencentcareer/api/post/" \
                  "ByPostId?timestamp=1631850857726&postId={}&language=zh-cn"
    start_urls = [first_url.format(encode_job, 1)]
    def parse(self, response):
        # The response body is a JSON string; convert it to a dict
        json_dict = json.loads(response.text)
        print(type(json_dict))
        print(json_dict)
        # total number of matching jobs
        job_counts = json_dict["Data"]["Count"]
        print("Total jobs: {}".format(job_counts))
        # total number of pages (10 jobs per page, matching pageSize=10 in the URL)
        total_pages = math.ceil(job_counts / 10)
        print("Total pages: {}".format(total_pages))
        # build the first-level URL for every page
        for page in range(1, total_pages + 1):
            first_url = self.first_url.format(self.encode_job, page)
            # Hand the URL to the scheduler to fetch the postIds; callback names
            # the method that will parse the response. dont_filter=True is needed
            # because page 1 was already fetched as the start URL and would
            # otherwise be dropped by Scrapy's duplicate filter.
            yield scrapy.Request(url=first_url, callback=self.parse_post_ids,
                                 dont_filter=True)
    def parse_post_ids(self, response):
        # "Posts" is a list of dicts; each dict contains a PostId
        posts = json.loads(response.text)["Data"]["Posts"]
        # take each job's PostId
        for post in posts:
            post_id = post["PostId"]
            # build the second-level (detail) URL
            second_url = self.second_urls.format(post_id)
            # hand the URL to the scheduler
            yield scrapy.Request(url=second_url, callback=self.parse_job)
    def parse_job(self, response):
        # second-level page: parse the job details
        job = json.loads(response.text)["Data"]
        name = job['RecruitPostName']
        location = job['LocationName']
        category = job['CategoryName']
        responsibility = job['Responsibility']
        requirement = job['Requirement']
        time = job['LastUpdateTime']
        print(name)
        print(location)
        print(category)
        print(responsibility)
        print(requirement)
        print(time)
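For reference, this is roughly the JSON the two endpoints return, inferred only from the keys the spider reads above (the values shown are placeholders, not real data):

# Query endpoint (first_url): one page of search results
{"Data": {"Count": 345,                          # total number of matching jobs
          "Posts": [{"PostId": "12345...", ...},  # one entry per job on the page
                    ...]}}

# ByPostId endpoint (second_urls): details for a single job
{"Data": {"RecruitPostName": "...", "LocationName": "...",
          "CategoryName": "...", "Responsibility": "...",
          "Requirement": "...", "LastUpdateTime": "..."}}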
4. run.py
from scrapy import cmdline
cmdline.execute("scrapy crawl tencent".split())
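A note on running: you can also just run scrapy crawl tencent from the project directory. Be aware that because job = input(...) sits at class-definition level, the prompt fires whenever the spider module is imported, including for commands like scrapy list. A more idiomatic variant (a hypothetical sketch, not part of the original tutorial; the query string is trimmed for brevity, so in practice reuse the full one from tencent.py) passes the keyword as a spider argument via -a:

import urllib.parse

import scrapy

class TencentSpider(scrapy.Spider):
    name = 'tencent'
    allowed_domains = ['careers.tencent.com']
    # trimmed first-level URL; see tencent.py above for the full query string
    first_url = ("https://careers.tencent.com/tencentcareer/api/post/Query?"
                 "keyword={}&pageIndex={}&pageSize=10&language=zh-cn&area=cn")

    def __init__(self, job='python', *args, **kwargs):
        # `job` comes from the command line: scrapy crawl tencent -a job=python
        super().__init__(*args, **kwargs)
        self.encode_job = urllib.parse.quote(job)

    def start_requests(self):
        yield scrapy.Request(self.first_url.format(self.encode_job, 1),
                             callback=self.parse)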