Scraping the Maoyan TOP100 movie board with Requests + regular expressions
NOTE: results are written to a file without mojibake (UTF-8 plus ensure_ascii=False), and a process pool crawls all pages in seconds.
import requests
from requests.exceptions import RequestException
import re
import json
from multiprocessing import Pool


def get_one_page(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def parse_one_page(html):
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
        + r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        + r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {
            '排行': item[0],
            '图片': item[1],
            '电影': item[2],
            '演员': item[3].strip()[3:],
            '上映信息': item[4].strip()[5:],
            '评分': item[5] + item[6]
        }


def write_to_file(content):
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(offset):
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)


if __name__ == '__main__':
    # for i in range(10):
    #     main(i * 10)
    pool = Pool()  # process pool: crawl the offset pages in parallel
    pool.map(main, [i * 10 for i in range(10)])
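Because write_to_file appends one JSON object per line with ensure_ascii=False, the resulting UTF-8 file can be loaded back without any decoding tricks. A minimal read-back sketch, assuming only the result.txt produced by the code above:

import json

# Re-load the crawled records: one JSON object per line, UTF-8 throughout.
with open('result.txt', encoding='utf-8') as f:
    movies = [json.loads(line) for line in f if line.strip()]
print(len(movies))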
Scraping videos from Xiaohuar.com with Requests + regular expressions
import requests
import re
import os


def get_page(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        return response.text
    except:
        print("爬取失败")


def get_url(html):
    pattern = re.compile('class="items".*?href="(.*?)"', re.S)
    urls = re.findall(pattern, html)
    for url in urls:
        if not url.startswith('http'):
            url = 'http://www.xiaohuar.com' + url
        yield url


def get_detail_url(detail_content):
    pattern = re.compile('id="media".*?src="(.*?)"', re.S)
    urls = re.findall(pattern, detail_content)
    for url in urls:
        if url:
            if url.endswith('.mp4'):
                yield url


def download(url):
    root = "D://movie2//"
    path = root + url.split('/')[-1]
    try:
        if not os.path.exists(root):
            os.mkdir(root)
        if not os.path.exists(path):
            response = requests.get(url)
            # with open(path, 'wb') as f:
            #     f.write(response.content)
            with open(path, 'wb') as f:
                for line in response.iter_content():
                    f.write(line)
            print("文件保存成功")
        else:
            print("文件已存在")
    except:
        print("下载失败")


def main(page_num):
    url = 'http://www.xiaohuar.com/list-3-{0}.html'.format(page_num)
    html = get_page(url)
    urls = get_url(html)
    for url in urls:
        detail_content = get_page(url)
        detail_urls = get_detail_url(detail_content)
        for detail_url in detail_urls:
            download(detail_url)


if __name__ == '__main__':
    for num in range(30):
        main(num)
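download() above calls requests.get(url) without stream=True, so each video is buffered completely in memory before iter_content() walks over it. A streaming variant is sketched below; it keeps the same directory layout, the 1 MB chunk size is an arbitrary choice rather than something from the original script, and it assumes a requests version recent enough to use the response as a context manager:

import os
import requests

def download_streaming(url, root="D://movie2//", chunk_size=1024 * 1024):
    """Write the video to disk chunk by chunk instead of holding it all in memory."""
    os.makedirs(root, exist_ok=True)
    path = os.path.join(root, url.split('/')[-1])
    if os.path.exists(path):
        print("文件已存在")
        return path
    with requests.get(url, stream=True, timeout=30) as response:
        response.raise_for_status()
        with open(path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=chunk_size):
                if chunk:
                    f.write(chunk)
    print("文件保存成功")
    return path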
Logging in to GitHub with Requests + PyQuery
import requests
from pyquery import PyQuery

LOGIN_URL = 'https://github.com/login'
SESSION_URL = 'https://github.com/session'

session = requests.session()
response = session.get(LOGIN_URL)
text = PyQuery(response.text)
authenticity_token = text(
    '#login > form > div:nth-child(1) > input[type="hidden"]:nth-child(2)'
).attr('value')

data = {
    'commit': 'Sign in',
    'utf8': '✓',
    'authenticity_token': authenticity_token,
    'login': 'lcgsmile@qq.com',
    'password': 'lcg@pwd.'
}

response = session.post(SESSION_URL, data=data)
print(response.status_code)  # 200
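The nth-child selector used to pull the hidden CSRF field is tied to the exact layout of GitHub's login form and breaks whenever the markup shifts. A selector keyed on the input's name attribute is one less brittle alternative; a sketch, assuming the login page still exposes a hidden input named authenticity_token:

import requests
from pyquery import PyQuery

# Grab the CSRF token by its name attribute rather than by its position in the form.
session = requests.session()
doc = PyQuery(session.get('https://github.com/login').text)
token = doc('input[name="authenticity_token"]').attr('value')
print(token)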
Analyzing Ajax requests and scraping Toutiao street-style photo galleries
Configuration file config.py
MONGO_URL = 'localhost'
MONGO_DB = 'toutiao'
MONGO_TABLE = 'toutiao'
GROUP_START = 1
GROUP_END = 20
KEYWORD = '街拍'
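Before launching the spider it can be handy to confirm that these MongoDB settings actually connect. A quick check sketch, assuming pymongo 3.7+ for count_documents:

import pymongo
from config import MONGO_URL, MONGO_DB, MONGO_TABLE

client = pymongo.MongoClient(MONGO_URL)
# Prints 0 on a fresh database; raises after a timeout if the server is unreachable.
print(client[MONGO_DB][MONGO_TABLE].count_documents({}))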
Main spider file
import json
import os
from urllib.parse import urlencode
import pymongo
import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError
import re
from multiprocessing import Pool
from hashlib import md5
from json.decoder import JSONDecodeError
from config import *

client = pymongo.MongoClient(MONGO_URL, connect=False)  # connect=False so each worker process opens its own connection
db = client[MONGO_DB]


def get_page_index(offset, keyword):
    """Fetch one page of the search index."""
    data = {
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,
        'format': 'json',
        'keyword': keyword,
        'offset': offset,
    }
    params = urlencode(data)  # turn the dict into a URL query string
    base = 'http://www.toutiao.com/search_content/'
    url = base + '?' + params
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        print('Error occurred')
        return None


def download_image(url):
    """Download a single image."""
    print('Downloading', url)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            save_image(response.content)
        return None
    except ConnectionError:
        return None


def save_image(content):
    """Save image bytes to disk."""
    # The MD5 hash of the content is used as the file name to avoid duplicates.
    file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
    print(file_path)
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)


def parse_page_index(text):
    """Parse the index JSON and yield article URLs."""
    try:
        data = json.loads(text)  # JSON string -> dict
        if data and 'data' in data.keys():
            for item in data.get('data'):
                yield item.get('article_url')
    except JSONDecodeError:
        pass


def get_page_detail(url):
    """Request a detail page."""
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        print('Error occurred')
        return None


def parse_page_detail(html, url):
    """Parse a detail page and download its gallery images."""
    soup = BeautifulSoup(html, 'lxml')
    result = soup.select('title')
    title = result[0].get_text() if result else ''
    images_pattern = re.compile(r'gallery: JSON.parse\("(.*)"\)', re.S)
    result = re.search(images_pattern, html)
    if result:
        data = json.loads(result.group(1).replace('\\', ''))
        if data and 'sub_images' in data.keys():
            sub_images = data.get('sub_images')
            images = [item.get('url') for item in sub_images]
            for image in images:
                download_image(image)
            return {
                'title': title,
                'url': url,
                'images': images
            }


def save_to_mongo(result):
    """Insert one record into MongoDB."""
    if db[MONGO_TABLE].insert(result):
        print('Successfully Saved to Mongo', result)
        return True
    return False


def main(offset):
    text = get_page_index(offset, KEYWORD)
    urls = parse_page_index(text)
    for url in urls:
        html = get_page_detail(url)
        result = parse_page_detail(html, url)
        if result:
            save_to_mongo(result)


if __name__ == '__main__':
    pool = Pool()
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool.map(main, groups)
    pool.close()
    pool.join()
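save_to_mongo relies on Collection.insert(), which pymongo deprecated in 3.0 and removed in 4.0. A variant built on insert_one, sketched under the assumption of pymongo 3.x or newer and reusing the db and MONGO_TABLE names from the spider above:

def save_to_mongo(result):
    """Insert one record into MongoDB via the non-deprecated insert_one API."""
    try:
        db[MONGO_TABLE].insert_one(result)
        print('Successfully Saved to Mongo', result)
        return True
    except Exception as error:
        print('Failed to save to Mongo', error)
        return False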
Automatically submitting resumes on Lagou
import requests
import re

# 1. ============================================ Authentication flow
session = requests.session()

# Step 1:
# Request URL: https://passport.lagou.com/login/login.html
# Method: GET
# Headers: only the User-Agent
r1 = session.get('https://passport.lagou.com/login/login.html',
                 headers={
                     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                 },
                 )

X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)[0]
X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)[0]
# print(X_Anti_Forge_Code)
# print(X_Anti_Forge_Token)

# Step 2:
# 1. Request URL: https://passport.lagou.com/login/login.json
# 2. Method: POST
# 3. Headers:
#       Referer: https://passport.lagou.com/login/login.html
#       User-Agent
#       X-Anit-Forge-Code
#       X-Anit-Forge-Token
#       X-Requested-With
# 4. Body:
#       isValidate: true
#       username: 1111111111
#       password: 70621c64832c4d4d66a47be6150b4a8e  # corresponds to the plaintext password alex3714
session.post('https://passport.lagou.com/login/login.json',
             headers={
                 'Referer': 'https://passport.lagou.com/login/login.html',
                 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                 'X-Anit-Forge-Code': X_Anti_Forge_Code,
                 'X-Anit-Forge-Token': X_Anti_Forge_Token,
                 'X-Requested-With': 'XMLHttpRequest'
             },
             data={
                 'isValidate': True,
                 'username': '18611453110',
                 'password': '70621c64832c4d4d66a47be6150b4a8e'
             }
             )

# Step 3:
# 1. Request URL: https://passport.lagou.com/grantServiceTicket/grant.html
# 2. Method: GET
# 3. Headers:
#       Referer: https://passport.lagou.com/login/login.html
#       User-Agent
session.get('https://passport.lagou.com/grantServiceTicket/grant.html',
            headers={
                'Referer': 'https://passport.lagou.com/login/login.html',
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
            }
            )

# Verify the login
response = session.get('https://www.lagou.com/resume/myresume.html',
                       headers={
                           'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                       }
                       )
# print('18611453110' in response.text)

# 2. ============================================ Scrape job listings
# 1. Request URL: https://www.lagou.com/jobs/positionAjax.json
# 2. Method: POST
#    Query parameters:
#       gj: 3年及以下
#       xl: 不要求
#       jd: 不需要融资
#       hy: 移动互联网
#       px: default
#       yx: 15k-25k
#       city: 全国
# 3. Headers:
#       User-Agent
#       Referer: https://www.lagou.com/jobs/list_%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD
#       X-Anit-Forge-Code: 0
#       X-Anit-Forge-Token: None
#       X-Requested-With: XMLHttpRequest
# 4. Body:
#       first: true
#       pn: 1
#       kd: python数据分析
from urllib.parse import urlencode

params = {'kw': 'python数据分析'}
res = urlencode(params).split('=')[-1]
url = 'https://www.lagou.com/jobs/list_' + res
# print(url)

response = session.post('https://www.lagou.com/jobs/positionAjax.json',
                        params={
                            # 'gj': '3年及以下',
                            # 'xl': '不要求',
                            # 'jd': '不需要融资',
                            # 'hy': '移动互联网',
                            'px': 'default',
                            'yx': '15k-25k',
                            'city': '北京',
                            'district': '海淀区',
                        },
                        headers={
                            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                            'Referer': url,
                        })
# print(response.status_code)

result = response.json()['content']['positionResult']['result']
for comanpy_info in result:
    fullname = comanpy_info['companyFullName']
    emp_num = comanpy_info['companySize']
    salary = comanpy_info['salary']
    workyear = comanpy_info['workYear']
    positionName = comanpy_info['positionName']
    positionId = comanpy_info['positionId']
    detail_url = 'https://www.lagou.com/jobs/%s.html' % (positionId)
    print(detail_url)
    print(fullname)
    print(emp_num)
    print(salary)
    print(workyear)
    print(positionName)
    print(positionId)
    print()

    # 3. ============================================ Apply for the position
    # Step 1: request the detail page
    # 1. detail_url: https://www.lagou.com/jobs/3984845.html
    # 2. Method: GET
    # 3. Headers: User-Agent
    r1 = session.get(detail_url,
                     headers={
                         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                     }
                     )

    X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)[0]
    X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)[0]

    # Step 2: submit the resume
    # 1. Request URL: https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json
    # 2. Method: POST
    # 3. Headers:
    #       User-Agent
    #       Referer: detail_url
    #       X-Anit-Forge-Code: 31832262
    #       X-Anit-Forge-Token: 9ee8b4bc-7107-49a0-a205-cedd7e77c2d7
    #       X-Requested-With: XMLHttpRequest
    # 4. Body:
    #       'positionId': 3984845
    #       'type': 1
    #       'force': True
    session.post('https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json',
                 headers={
                     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                     'Referer': detail_url,
                     'X-Anit-Forge-Code': X_Anti_Forge_Code,
                     'X-Anit-Forge-Token': X_Anti_Forge_Token,
                     'X-Requested-With': 'XMLHttpRequest'
                 },
                 data={
                     'positionId': positionId,
                     'type': 1,
                     'force': True
                 }
                 )
    print('投递成功', detail_url)
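Both token lookups use re.findall(...)[0], which raises IndexError whenever the page does not carry the anti-forge values (a CAPTCHA page or a failed login, for instance). A small defensive helper is sketched below; the name extract_token is illustrative and not part of the original script:

import re

def extract_token(page_text, name):
    # Return the anti-forge value, or None when the page does not expose it.
    match = re.search(r"%s = '(.*?)'" % name, page_text, re.S)
    return match.group(1) if match else None

# X_Anti_Forge_Token = extract_token(r1.text, 'X_Anti_Forge_Token')
# X_Anti_Forge_Code = extract_token(r1.text, 'X_Anti_Forge_Code')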
