由于本人不太擅长用文字讲解,所以就直接上代码了,见谅;有不明白的地方可以问我。
1. requests爬取
import requests
import time
# Fetch the first page of Lagou search results for "python爬虫" in Beijing and
# print each position's name and salary.
#
# The search page is requested first through the same session, so whatever
# cookies that response sets are sent along with the JSON POST that follows.
list_url = ('https://www.lagou.com/jobs/list_python%E7%88%AC%E8%99%AB'
            '?labelWords=sug&fromSearch=true&suginput=python')
# Take care not to mix up the two similar endpoints:
#   https://www.lagou.com/jobs/companyAjax.json
#   https://www.lagou.com/jobs/positionAjax.json
ajax_url = ('https://www.lagou.com/jobs/positionAjax.json'
            '?city=%E5%8C%97%E4%BA%AC&needAddtionalResult=false')
# Seconds to wait for the server before giving up; without a timeout a
# stalled connection would block the script forever.
request_timeout = 10

headers = {
    'Referer': list_url,
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.80 Safari/537.36',
}
session = requests.session()
session.headers.update(headers)
session.get(list_url, timeout=request_timeout)

data = {
    'first': 'false',
    'pn': 1,
    'kd': 'python爬虫',
}
# The session already carries `headers`, so they are not passed again here.
r1 = session.post(url=ajax_url, data=data, timeout=request_timeout)
r1.raise_for_status()
payload = r1.json()

try:
    results = payload['content']['positionResult']['result']
except (KeyError, TypeError) as err:
    # Fail with the full payload instead of an opaque KeyError when the
    # response does not have the expected structure.
    raise RuntimeError(
        'Unexpected response from positionAjax.json: {!r}'.format(payload)
    ) from err

for result in results:
    print(result['positionName'].strip())
    print(result['salary'])
# NOTE(review): the pause is kept once, after the page has been printed;
# sleeping between individual print calls would not throttle any request.
time.sleep(1)
2. scrapy爬取
import time
import scrapy
import json
class LagouSpider(scrapy.Spider):
    """Print position names and salaries from Lagou's "python爬虫" search.

    The search page is requested first with a named cookie jar; the JSON
    endpoint is then POSTed once per result page using that same jar, so
    cookies set by the first response accompany the POST requests.
    """

    name = 'lagou'
    allowed_domains = ['www.lagou.com']
    # Throttle in the downloader rather than with time.sleep() in a callback:
    # a blocking sleep stalls Scrapy's event loop and all in-flight requests.
    custom_settings = {'DOWNLOAD_DELAY': 1}

    # Number of result pages to request (pages 1..page_count).
    page_count = 6

    _LIST_URL = ('https://www.lagou.com/jobs/list_python%E7%88%AC%E8%99%AB'
                 '?labelWords=sug&fromSearch=true&suginput=python')
    _POST_URL = ('https://www.lagou.com/jobs/positionAjax.json'
                 '?city=%E5%8C%97%E4%BA%AC&needAddtionalResult=false')
    _HEADERS = {
        'Referer': _LIST_URL,
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/75.0.3770.80 Safari/537.36',
    }

    def start_requests(self):
        """Request the search page so its cookies land in cookie jar 1."""
        yield scrapy.Request(
            self._LIST_URL,
            headers=self._HEADERS,
            meta={'cookiejar': 1},
            callback=self.do_post,
        )

    def do_post(self, response):
        """Yield one form POST per result page, reusing the response's cookie jar."""
        for page in range(1, self.page_count + 1):
            formdata = {
                'first': 'false',
                'pn': str(page),
                'kd': 'python爬虫',
            }
            yield scrapy.FormRequest(
                url=self._POST_URL,
                formdata=formdata,
                headers=self._HEADERS,
                meta={'cookiejar': response.meta['cookiejar']},
                callback=self.parse,
            )

    def parse(self, response):
        """Print the name and salary of every position in one JSON result page."""
        try:
            payload = json.loads(response.text)
            results = payload['content']['positionResult']['result']
        except (ValueError, KeyError, TypeError):
            # The body is not JSON or lacks the expected structure; log it
            # and skip this page instead of aborting the callback.
            self.logger.warning(
                'Unexpected response from %s: %s',
                response.url, response.text[:200],
            )
            return
        for result in results:
            print(result['positionName'].strip())
            print(result['salary'])
可以拿最新的腾讯招聘网站练练手,哈哈。