I've been writing crawlers lately, and some sites require request headers. Copying a site's headers one by one into a dict by hand is tedious. Then I noticed that Chrome can copy any request as a cURL command, which contains all of that page's headers, so I wrote a function that extracts them automatically and returns them as a dict.
First, how to grab a page's cURL command: open Chrome DevTools (F12), switch to the Network tab, refresh the page, right-click the request you want, and choose Copy > Copy as cURL (cmd). That gives you something like this:
curl "https://www.baidu.com/" ^
-H "Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9" ^
-H "Accept-Language: zh-CN,zh;q=0.9" ^
-H "Cache-Control: no-cache" ^
-H "Connection: keep-alive" ^
-H "Cookie: BIDUPSID=CF2E815137707285512CD3252D8DDEE1; PSTM=1630408809; BAIDUID=0E83103611A0E0DF4EE7196D38446655:SL=0:NR=20:FG=1; BDUSS=1RnVDZnSDc1RkhnOGpMWWxwS35SeUV0Z09nbXRnTXEtMFhobXAxaFkzdDVsWk5tSVFBQUFBJCQAAAAAAAAAAAEAAADPVpQ38575eQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAHkIbGZ5CGxmM; BDUSS_BFESS=1RnVDZnSDc1RkhnOGpMWWxwS35SeUV0Z09nbXRnTXEtMFhobXAxaFkzdDVsWk5tSVFBQUFBJCQAAAAAAAAAAAEAAADPVpQ38575eQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAHkIbGZ5CGxmM; BD_UPN=12314753; newlogin=1; H_PS_PSSID=60274_60853_60887_60875; H_WISE_SIDS=60274_60853_60887_60875; H_WISE_SIDS_BFESS=60274_60853_60887_60875; MCITY=-163^%^3A; BDORZ=FFFB88E999055A3F8A630C64834BD6D0; delPer=0; BD_CK_SAM=1; PSINO=3; BAIDUID_BFESS=0E83103611A0E0DF4EE7196D38446655:SL=0:NR=20:FG=1; Hm_lvt_aec699bb6442ba076c8981c6dc490771=1729689873; Hm_lpvt_aec699bb6442ba076c8981c6dc490771=1729689873; HMACCOUNT=B37F5B70AA875D58; BA_HECTOR=8404ag8l0l0h0h81ah2la4a00dpf5c1jhhu8l1v; ZFY=G3nKhL:ANfffFHevk2kouguWbEucWfCyZpHpz1iLakIQ:C; BDRCVFR^[Zh1eoDf3ZW3^]=mk3SLVN4HKm; sug=3; sugstore=0; ORIGIN=0; bdime=0; H_PS_645EC=eb43^%^2BrAJdQk5QAx^%^2FVRCKlW8aIw1QS^%^2B9cy^%^2F4SdYnWgLQUFFMo1BvsGJdNXpdGj7^%^2BSK14GeA; B64_BOT=1" ^
-H "Pragma: no-cache" ^
-H "Referer: https://www.baidu.com/s?tn=15007414_9_dg&ie=utf-8&wd=baidu.com" ^
-H "Sec-Fetch-Dest: document" ^
-H "Sec-Fetch-Mode: navigate" ^
-H "Sec-Fetch-Site: same-origin" ^
-H "Sec-Fetch-User: ?1" ^
-H "Upgrade-Insecure-Requests: 1" ^
-H "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36" ^
-H "sec-ch-ua: ^\^"Not_A Brand^\^";v=^\^"99^\^", ^\^"Google Chrome^\^";v=^\^"109^\^", ^\^"Chromium^\^";v=^\^"109^\^"" ^
-H "sec-ch-ua-mobile: ?0" ^
-H "sec-ch-ua-platform: ^\^"Windows^\^"" ^
--compressed
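The copied command is multi-line and riddled with ^ escape characters (the cmd.exe escape/line-continuation character), so the easiest way to get it into Python untouched is to paste it into a raw triple-quoted string. A minimal sketch (the variable name curl_cmd is mine, and most of the headers are elided here):

# Paste the copied command verbatim between the triple quotes.
curl_cmd = r'''curl "https://www.baidu.com/" ^
  -H "Accept-Language: zh-CN,zh;q=0.9" ^
  -H "User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36" ^
  --compressed'''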
Here is the parsing code:
import re
import json
from urllib.parse import unquote


def curl_to_headers(curl):
    """Parse a 'Copy as cURL (cmd)' command: print a ready-made requests
    snippet and return the headers as a dict."""
    # Strip the cmd.exe escape characters (^ and \) and the flags we
    # don't need, then collapse all whitespace (including the line
    # breaks) into single spaces.
    curl = re.sub(r'\^|\\|--compressed|--insecure', '', curl)
    curl = ' '.join(curl.split())
    data = None
    if '--data-raw' in curl:
        # Everything after --data-raw is the POST payload; cut it off
        # here so it doesn't pollute the last header.
        curl, data = curl.split('--data-raw', 1)
        data = data.strip()[1:-1]  # drop the surrounding quotes
        if '&' not in data:
            # A JSON payload: pretty-print it via a load/dump round trip.
            data = json.dumps(json.loads(data), ensure_ascii=False, indent=4)
        else:
            # A form payload: split key=value pairs on the first '=' only.
            pairs = dict(i.split('=', 1) for i in data.split('&'))
            data = json.dumps(pairs, ensure_ascii=False, indent=4)
    curl = curl.replace('"', '')
    # Each header was passed as -H "Name: value"; split the name off at
    # the first colon so values like URLs keep their own colons.
    chunks = curl.split(' -H')
    headers = {}
    for chunk in chunks[1:]:
        name, value = chunk.strip().split(':', 1)
        headers[name.strip()] = value.strip()
    # The first chunk is `curl <url>`; split off the query string, if any.
    target = chunks[0].split('curl')[-1].strip()
    url = target.split('?')[0]
    params = None
    if '?' in target:
        query = target.split('?', 1)[1]
        params = {i.split('=', 1)[0]: unquote(i.split('=', 1)[1])
                  for i in query.split('&')}
        params = json.dumps(params, ensure_ascii=False, indent=4)
    headers_text = json.dumps(headers, ensure_ascii=False, indent=4)
    if data is not None:
        text = f"url = r'{url}'\nheaders = {headers_text}\ndata = {data}\nresp = requests.post(url=url,headers=headers,data=data)"
    elif params is not None:
        text = f"url = r'{url}'\nheaders = {headers_text}\nparams = {params}\nresp = requests.get(url=url,headers=headers,params=params)"
    else:
        text = f"url = r'{url}'\nheaders = {headers_text}\nresp = requests.get(url=url,headers=headers)"
    print(text)
    # Return the actual dict (not its JSON string) so the caller can pass
    # it straight to requests.
    return headers
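With the command saved in a raw string like curl_cmd above, a single call prints the generated snippet and hands back the headers dict:

headers = curl_to_headers(curl_cmd)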
The printed result looks like this:
url = r'https://www.baidu.com/'
headers = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"Accept-Language": "zh-CN,zh;q=0.9",
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"Cookie": "BIDUPSID=CF2E815137707285512CD3252D8DDEE1; PSTM=1630408809; BAIDUID=0E83103611A0E0DF4EE7196D38446655:SL=0:NR=20:FG=1; BDUSS=1RnVDZnSDc1RkhnOGpMWWxwS35SeUV0Z09nbXRnTXEtMFhobXAxaFkzdDVsWk5tSVFBQUFBJCQAAAAAAAAAAAEAAADPVpQ38575eQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAHkIbGZ5CGxmM; BDUSS_BFESS=1RnVDZnSDc1RkhnOGpMWWxwS35SeUV0Z09nbXRnTXEtMFhobXAxaFkzdDVsWk5tSVFBQUFBJCQAAAAAAAAAAAEAAADPVpQ38575eQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAHkIbGZ5CGxmM; BD_UPN=12314753; newlogin=1; H_PS_PSSID=60274_60853_60887_60875; H_WISE_SIDS=60274_60853_60887_60875; H_WISE_SIDS_BFESS=60274_60853_60887_60875; MCITY=-163%3A; BDORZ=FFFB88E999055A3F8A630C64834BD6D0; delPer=0; BD_CK_SAM=1; PSINO=3; BAIDUID_BFESS=0E83103611A0E0DF4EE7196D38446655:SL=0:NR=20:FG=1; Hm_lvt_aec699bb6442ba076c8981c6dc490771=1729689873; Hm_lpvt_aec699bb6442ba076c8981c6dc490771=1729689873; HMACCOUNT=B37F5B70AA875D58; BA_HECTOR=8404ag8l0l0h0h81ah2la4a00dpf5c1jhhu8l1v; ZFY=G3nKhL:ANfffFHevk2kouguWbEucWfCyZpHpz1iLakIQ:C; BDRCVFR[Zh1eoDf3ZW3]=mk3SLVN4HKm; sug=3; sugstore=0; ORIGIN=0; bdime=0; H_PS_645EC=eb43%2BrAJdQk5QAx%2FVRCKlW8aIw1QS%2B9cy%2F4SdYnWgLQUFFMo1BvsGJdNXpdGj7%2BSK14GeA; B64_BOT=1",
"Pragma": "no-cache",
"Referer": "https://www.baidu.com/s?tn=15007414_9_dg&ie=utf-8&wd=baidu.com",
"Sec-Fetch-Dest": "document",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-Site": "same-origin",
"Sec-Fetch-User": "?1",
"Upgrade-Insecure-Requests": "1",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36",
"sec-ch-ua": "Not_A Brand;v=99,Google Chrome;v=109,Chromium;v=109",
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": "Windows"
}
resp = requests.get(url=url,headers=headers)
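Because the function returns the real dict rather than the printed JSON text, you can also skip copy-pasting the snippet and feed the result straight to requests. A minimal sketch (requests must be installed; curl_cmd is the raw string from above):

import requests

headers = curl_to_headers(curl_cmd)  # prints the snippet as a side effect
resp = requests.get('https://www.baidu.com/', headers=headers)
print(resp.status_code)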