Scraping the Maoyan TOP100 movie board with Requests + regular expressions
NOTE: results are written to a file without mojibake (UTF-8 plus ensure_ascii=False), and a process pool crawls all pages in seconds.
import requests
from requests.exceptions import RequestException
import re
import json
from multiprocessing import Pool


def get_one_page(url):
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None


def parse_one_page(html):
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
        + r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        + r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>', re.S)
    items = re.findall(pattern, html)
    for item in items:
        yield {
            '排行': item[0],
            '图片': item[1],
            '电影': item[2],
            '演员': item[3].strip()[3:],
            '上映信息': item[4].strip()[5:],
            '评分': item[5] + item[6]
        }


def write_to_file(content):
    with open('result.txt', 'a', encoding='utf-8') as f:
        f.write(json.dumps(content, ensure_ascii=False) + '\n')


def main(offset):
    url = 'http://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    for item in parse_one_page(html):
        print(item)
        write_to_file(item)


if __name__ == '__main__':
    # for i in range(10):
    #     main(i * 10)
    pool = Pool()  # process pool: crawl the offset pages in parallel
    pool.map(main, [i * 10 for i in range(10)])
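Because write_to_file appends one JSON object per line with ensure_ascii=False, the resulting UTF-8 file can be loaded back without any decoding tricks. A minimal read-back sketch, assuming only the result.txt produced by the code above:

import json

# Re-load the crawled records: one JSON object per line, UTF-8 throughout.
with open('result.txt', encoding='utf-8') as f:
    movies = [json.loads(line) for line in f if line.strip()]
print(len(movies))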
Scraping videos from Xiaohuar.com with Requests + regular expressions
import requests
import re
import os


def get_page(url):
    try:
        response = requests.get(url)
        response.raise_for_status()
        response.encoding = response.apparent_encoding
        return response.text
    except:
        print("爬取失败")


def get_url(html):
    pattern = re.compile('class="items".*?href="(.*?)"', re.S)
    urls = re.findall(pattern, html)
    for url in urls:
        if not url.startswith('http'):
            url = 'http://www.xiaohuar.com' + url
        yield url


def get_detail_url(detail_content):
    pattern = re.compile('id="media".*?src="(.*?)"', re.S)
    urls = re.findall(pattern, detail_content)
    for url in urls:
        if url:
            if url.endswith('.mp4'):
                yield url


def download(url):
    root = "D://movie2//"
    path = root + url.split('/')[-1]
    try:
        if not os.path.exists(root):
            os.mkdir(root)
        if not os.path.exists(path):
            response = requests.get(url)
            # with open(path, 'wb') as f:
            #     f.write(response.content)
            with open(path, 'wb') as f:
                for line in response.iter_content():
                    f.write(line)
            print("文件保存成功")
        else:
            print("文件已存在")
    except:
        print("下载失败")


def main(page_num):
    url = 'http://www.xiaohuar.com/list-3-{0}.html'.format(page_num)
    html = get_page(url)
    urls = get_url(html)
    for url in urls:
        detail_content = get_page(url)
        detail_urls = get_detail_url(detail_content)
        for detail_url in detail_urls:
            download(detail_url)


if __name__ == '__main__':
    for num in range(30):
        main(num)
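download() above calls requests.get(url) without stream=True, so each video is buffered completely in memory before iter_content() walks over it. A streaming variant is sketched below; it keeps the same directory layout, the 1 MB chunk size is an arbitrary choice rather than something from the original script, and it assumes a requests version recent enough to use the response as a context manager:

import os
import requests

def download_streaming(url, root="D://movie2//", chunk_size=1024 * 1024):
    """Write the video to disk chunk by chunk instead of holding it all in memory."""
    os.makedirs(root, exist_ok=True)
    path = os.path.join(root, url.split('/')[-1])
    if os.path.exists(path):
        print("文件已存在")
        return path
    with requests.get(url, stream=True, timeout=30) as response:
        response.raise_for_status()
        with open(path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=chunk_size):
                if chunk:
                    f.write(chunk)
    print("文件保存成功")
    return path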
Logging in to GitHub with Requests + PyQuery
import requests
from pyquery import PyQuery

LOGIN_URL = 'https://github.com/login'
SESSION_URL = 'https://github.com/session'

session = requests.session()
response = session.get(LOGIN_URL)
text = PyQuery(response.text)
authenticity_token = text(
    '#login > form > div:nth-child(1) > input[type="hidden"]:nth-child(2)'
).attr('value')

data = {
    'commit': 'Sign in',
    'utf8': '✓',
    'authenticity_token': authenticity_token,
    'login': 'lcgsmile@qq.com',
    'password': 'lcg@pwd.'
}

response = session.post(SESSION_URL, data=data)
print(response.status_code)  # 200
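The nth-child selector used to pull the hidden CSRF field is tied to the exact layout of GitHub's login form and breaks whenever the markup shifts. A selector keyed on the input's name attribute is one less brittle alternative; a sketch, assuming the login page still exposes a hidden input named authenticity_token:

import requests
from pyquery import PyQuery

# Grab the CSRF token by its name attribute rather than by its position in the form.
session = requests.session()
doc = PyQuery(session.get('https://github.com/login').text)
token = doc('input[name="authenticity_token"]').attr('value')
print(token)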
Analyzing Ajax requests and scraping Toutiao street-style photo galleries
Configuration file config.py
MONGO_URL = 'localhost'
MONGO_DB = 'toutiao'
MONGO_TABLE = 'toutiao'
GROUP_START = 1
GROUP_END = 20
KEYWORD = '街拍'
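Before launching the spider it can be handy to confirm that these MongoDB settings actually connect. A quick check sketch, assuming pymongo 3.7+ for count_documents:

import pymongo
from config import MONGO_URL, MONGO_DB, MONGO_TABLE

client = pymongo.MongoClient(MONGO_URL)
# Prints 0 on a fresh database; raises after a timeout if the server is unreachable.
print(client[MONGO_DB][MONGO_TABLE].count_documents({}))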
Main spider file
import json
import os
from urllib.parse import urlencode
import pymongo
import requests
from bs4 import BeautifulSoup
from requests.exceptions import ConnectionError
import re
from multiprocessing import Pool
from hashlib import md5
from json.decoder import JSONDecodeError
from config import *

client = pymongo.MongoClient(MONGO_URL, connect=False)  # connect=False so each worker process opens its own connection
db = client[MONGO_DB]


def get_page_index(offset, keyword):
    """Fetch one page of the search index."""
    data = {
        'autoload': 'true',
        'count': 20,
        'cur_tab': 3,
        'format': 'json',
        'keyword': keyword,
        'offset': offset,
    }
    params = urlencode(data)  # turn the dict into a URL query string
    base = 'http://www.toutiao.com/search_content/'
    url = base + '?' + params
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        print('Error occurred')
        return None


def download_image(url):
    """Download a single image."""
    print('Downloading', url)
    try:
        response = requests.get(url)
        if response.status_code == 200:
            save_image(response.content)
        return None
    except ConnectionError:
        return None


def save_image(content):
    """Save image bytes to disk."""
    # The MD5 hash of the content is used as the file name to avoid duplicates.
    file_path = '{0}/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
    print(file_path)
    if not os.path.exists(file_path):
        with open(file_path, 'wb') as f:
            f.write(content)


def parse_page_index(text):
    """Parse the index JSON and yield article URLs."""
    try:
        data = json.loads(text)  # JSON string -> dict
        if data and 'data' in data.keys():
            for item in data.get('data'):
                yield item.get('article_url')
    except JSONDecodeError:
        pass


def get_page_detail(url):
    """Request a detail page."""
    try:
        response = requests.get(url)
        if response.status_code == 200:
            return response.text
        return None
    except ConnectionError:
        print('Error occurred')
        return None


def parse_page_detail(html, url):
    """Parse a detail page and download its gallery images."""
    soup = BeautifulSoup(html, 'lxml')
    result = soup.select('title')
    title = result[0].get_text() if result else ''
    images_pattern = re.compile(r'gallery: JSON.parse\("(.*)"\)', re.S)
    result = re.search(images_pattern, html)
    if result:
        data = json.loads(result.group(1).replace('\\', ''))
        if data and 'sub_images' in data.keys():
            sub_images = data.get('sub_images')
            images = [item.get('url') for item in sub_images]
            for image in images:
                download_image(image)
            return {
                'title': title,
                'url': url,
                'images': images
            }


def save_to_mongo(result):
    """Insert one record into MongoDB."""
    if db[MONGO_TABLE].insert(result):
        print('Successfully Saved to Mongo', result)
        return True
    return False


def main(offset):
    text = get_page_index(offset, KEYWORD)
    urls = parse_page_index(text)
    for url in urls:
        html = get_page_detail(url)
        result = parse_page_detail(html, url)
        if result:
            save_to_mongo(result)


if __name__ == '__main__':
    pool = Pool()
    groups = [x * 20 for x in range(GROUP_START, GROUP_END + 1)]
    pool.map(main, groups)
    pool.close()
    pool.join()
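save_to_mongo relies on Collection.insert(), which pymongo deprecated in 3.0 and removed in 4.0. A variant built on insert_one, sketched under the assumption of pymongo 3.x or newer and reusing the db and MONGO_TABLE names from the spider above:

def save_to_mongo(result):
    """Insert one record into MongoDB via the non-deprecated insert_one API."""
    try:
        db[MONGO_TABLE].insert_one(result)
        print('Successfully Saved to Mongo', result)
        return True
    except Exception as error:
        print('Failed to save to Mongo', error)
        return False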
Automatically submitting resumes on Lagou
import requests
import re

# 1. ============================================ Authentication flow
session = requests.session()

# Step 1:
# Request URL: https://passport.lagou.com/login/login.html
# Method: GET
# Headers: only the User-Agent
r1 = session.get('https://passport.lagou.com/login/login.html',
                 headers={
                     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                 },
                 )

X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)[0]
X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)[0]
# print(X_Anti_Forge_Code)
# print(X_Anti_Forge_Token)

# Step 2:
# 1. Request URL: https://passport.lagou.com/login/login.json
# 2. Method: POST
# 3. Headers:
#       Referer: https://passport.lagou.com/login/login.html
#       User-Agent
#       X-Anit-Forge-Code
#       X-Anit-Forge-Token
#       X-Requested-With
# 4. Body:
#       isValidate: true
#       username: 1111111111
#       password: 70621c64832c4d4d66a47be6150b4a8e  # corresponds to the plaintext password alex3714
session.post('https://passport.lagou.com/login/login.json',
             headers={
                 'Referer': 'https://passport.lagou.com/login/login.html',
                 'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                 'X-Anit-Forge-Code': X_Anti_Forge_Code,
                 'X-Anit-Forge-Token': X_Anti_Forge_Token,
                 'X-Requested-With': 'XMLHttpRequest'
             },
             data={
                 'isValidate': True,
                 'username': '18611453110',
                 'password': '70621c64832c4d4d66a47be6150b4a8e'
             }
             )

# Step 3:
# 1. Request URL: https://passport.lagou.com/grantServiceTicket/grant.html
# 2. Method: GET
# 3. Headers:
#       Referer: https://passport.lagou.com/login/login.html
#       User-Agent
session.get('https://passport.lagou.com/grantServiceTicket/grant.html',
            headers={
                'Referer': 'https://passport.lagou.com/login/login.html',
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
            }
            )

# Verify the login
response = session.get('https://www.lagou.com/resume/myresume.html',
                       headers={
                           'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                       }
                       )
# print('18611453110' in response.text)

# 2. ============================================ Scrape job listings
# 1. Request URL: https://www.lagou.com/jobs/positionAjax.json
# 2. Method: POST
#    Query parameters:
#       gj: 3年及以下
#       xl: 不要求
#       jd: 不需要融资
#       hy: 移动互联网
#       px: default
#       yx: 15k-25k
#       city: 全国
# 3. Headers:
#       User-Agent
#       Referer: https://www.lagou.com/jobs/list_%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD
#       X-Anit-Forge-Code: 0
#       X-Anit-Forge-Token: None
#       X-Requested-With: XMLHttpRequest
# 4. Body:
#       first: true
#       pn: 1
#       kd: python数据分析
from urllib.parse import urlencode

params = {'kw': 'python数据分析'}
res = urlencode(params).split('=')[-1]
url = 'https://www.lagou.com/jobs/list_' + res
# print(url)

response = session.post('https://www.lagou.com/jobs/positionAjax.json',
                        params={
                            # 'gj': '3年及以下',
                            # 'xl': '不要求',
                            # 'jd': '不需要融资',
                            # 'hy': '移动互联网',
                            'px': 'default',
                            'yx': '15k-25k',
                            'city': '北京',
                            'district': '海淀区',
                        },
                        headers={
                            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                            'Referer': url,
                        })
# print(response.status_code)

result = response.json()['content']['positionResult']['result']
for comanpy_info in result:
    fullname = comanpy_info['companyFullName']
    emp_num = comanpy_info['companySize']
    salary = comanpy_info['salary']
    workyear = comanpy_info['workYear']
    positionName = comanpy_info['positionName']
    positionId = comanpy_info['positionId']
    detail_url = 'https://www.lagou.com/jobs/%s.html' % (positionId)
    print(detail_url)
    print(fullname)
    print(emp_num)
    print(salary)
    print(workyear)
    print(positionName)
    print(positionId)
    print()

    # 3. ============================================ Apply for the position
    # Step 1: request the detail page
    # 1. detail_url: https://www.lagou.com/jobs/3984845.html
    # 2. Method: GET
    # 3. Headers: User-Agent
    r1 = session.get(detail_url,
                     headers={
                         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                     }
                     )

    X_Anti_Forge_Token = re.findall("X_Anti_Forge_Token = '(.*?)'", r1.text, re.S)[0]
    X_Anti_Forge_Code = re.findall("X_Anti_Forge_Code = '(.*?)'", r1.text, re.S)[0]

    # Step 2: submit the resume
    # 1. Request URL: https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json
    # 2. Method: POST
    # 3. Headers:
    #       User-Agent
    #       Referer: detail_url
    #       X-Anit-Forge-Code: 31832262
    #       X-Anit-Forge-Token: 9ee8b4bc-7107-49a0-a205-cedd7e77c2d7
    #       X-Requested-With: XMLHttpRequest
    # 4. Body:
    #       'positionId': 3984845
    #       'type': 1
    #       'force': True
    session.post('https://www.lagou.com/mycenterDelay/deliverResumeBeforce.json',
                 headers={
                     'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                     'Referer': detail_url,
                     'X-Anit-Forge-Code': X_Anti_Forge_Code,
                     'X-Anit-Forge-Token': X_Anti_Forge_Token,
                     'X-Requested-With': 'XMLHttpRequest'
                 },
                 data={
                     'positionId': positionId,
                     'type': 1,
                     'force': True
                 }
                 )
    print('投递成功', detail_url)
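Both token lookups use re.findall(...)[0], which raises IndexError whenever the page does not carry the anti-forge values (a CAPTCHA page or a failed login, for instance). A small defensive helper is sketched below; the name extract_token is illustrative and not part of the original script:

import re

def extract_token(page_text, name):
    # Return the anti-forge value, or None when the page does not expose it.
    match = re.search(r"%s = '(.*?)'" % name, page_text, re.S)
    return match.group(1) if match else None

# X_Anti_Forge_Token = extract_token(r1.text, 'X_Anti_Forge_Token')
# X_Anti_Forge_Code = extract_token(r1.text, 'X_Anti_Forge_Code')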
