Straight to the code.
To run it yourself, change cookie to the cookie value of your own logged-in Baidu account, and change key to the keyword you want to crawl.
import time
from datetime import date, timedelta

import pandas as pd
import requests

key = ['特斯拉']
cookie = 'the cookie of your own Baidu account after logging in'
class DownloadBaiDuIndex(object):

    def __init__(self, cookie):
        self.cookie = cookie
        self.headers = {
            "Connection": "keep-alive",
            "Accept": "application/json, text/plain, */*",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
            "Sec-Fetch-Site": "same-origin",
            "Sec-Fetch-Mode": "cors",
            "Sec-Fetch-Dest": "empty",
            "Referer": "https://index.baidu.com/v2/main/index.html",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Cookie": self.cookie,
            "Host": "index.baidu.com",
            "X-Requested-With": "XMLHttpRequest",
            "Cipher-Text": "1656572408684_1656582701256_Nvm1pABkNsfD7V9VhZSzzFiFKylr3l5NR3YDrmHmH9yfFicm+Z9kmmwKVqVV6unvzAEh5hgXmgelP+OyOeaK8F21LyRVX1BDjxm+ezsglwoe1yfp6lEpuvu5Iggg1dz3PLF8e2II0e80ocXeU0jQFBhSbnB2wjhKl57JggTej12CzuL+h9eeVWdaMO4DSBWU2XX6PfbN8pv9+cdfFhVRHCzb0BJBU3iccoFczwNQUvzLn0nZsu0YPtG5DxDkGlRlZrCfKMtqKAe1tXQhg3+Oww4N3CQUM+6A/tKZA7jfRE6CGTFetC7QQyKlD7nxabkQ5CReAhFYAFAVYJ+sEqmY5pke8s3+RZ6jR7ASOih6Afl35EArbJzzLpnNPgrPCHoJiDUlECJveul7P5vvXl/O/Q==",
        }
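    # The ptbk string returned by the API is a simple substitution cipher:
    # its first half lists the obfuscated characters, its second half the
    # real ones. A hypothetical example (not real API output): with
    # ptbk = "wxyz109,", the mapping is {"w": "1", "x": "0", "y": "9",
    # "z": ","}, so decrypt("wxzy") returns "10,9".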
    def decrypt(self, ptbk, index_data):
        n = len(ptbk) // 2
        a = dict(zip(ptbk[:n], ptbk[n:]))
        return "".join([a[s] for s in index_data])
    def get_index_data_json(self, city, keys, start=None, end=None):
        words = [[{"name": key, "wordType": 1}] for key in keys]
        words = str(words).replace(" ", "").replace("'", "\"")
        url = f'http://index.baidu.com/api/SearchApi/index?area={city}&word={words}&startDate={start}&endDate={end}'
        res = requests.get(url, headers=self.headers)
        data = res.json()['data']
        uniqid = data['uniqid']
        # A second request fetches the ptbk key needed to decrypt the series.
        url = f'http://index.baidu.com/Interface/ptbk?uniqid={uniqid}'
        res = requests.get(url, headers=self.headers)
        ptbk = res.json()['data']
        result = {}
        result["startDate"] = start
        result["endDate"] = end
        for userIndexe in data['userIndexes']:
            name = userIndexe['word'][0]['name']
            tmp = {}
            index_all = userIndexe['all']['data']
            tmp["all"] = self.decrypt(ptbk, index_all).split(",")
            index_pc = userIndexe['pc']['data']
            tmp["pc"] = self.decrypt(ptbk, index_pc).split(",")
            index_wise = userIndexe['wise']['data']
            tmp["wise"] = self.decrypt(ptbk, index_wise).split(",")
            result[name] = tmp
        return result
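    # Shape of the dict returned above (inferred from the code, values are
    # illustrative):
    # {"startDate": "2022-01-01", "endDate": "2022-09-12",
    #  "特斯拉": {"all": [...], "pc": [...], "wise": [...]}}
    # where each list holds one decrypted daily index value as a string.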
    def GetIndex(self, city, keys, start=None, end=None):
        today = date.today()
        if start is None:
            start = str(today - timedelta(days=8))
        if end is None:
            end = str(today - timedelta(days=2))
        try:
            raw_data = self.get_index_data_json(city=city, keys=keys, start=start, end=end)
            raw_data = pd.DataFrame(raw_data[keys[0]])
            raw_data.index = pd.date_range(start=start, end=end)
            for index, row in raw_data.iterrows():  # iterate row by row
                print(str(row['all']))
                print(str(row['wise']))
                print(str(row['pc']))
            time.sleep(1)
        except Exception as e:
            print(e)
            raw_data = pd.DataFrame({'all': [], 'pc': [], 'wise': []})
        finally:
            return raw_data
# Instantiate the downloader
downloadbaiduindex = DownloadBaiDuIndex(cookie=cookie)
city_d = [95,94,133,195,196,197,198,199,200,201,202,203,204,205,207,208,209,210,211,212,213,168,262,263,264,265,266,268,370,371,373,374,375,376,378,379,380,381,667,97,96,98,99,100,101,102,103,104,106,107,108,109,111,112,113,114,291,417,457,479,125,126,127,156,157,158,159,160,161,162,163,169,172,28,30,31,32,33,34,35,36,37,38,39,40,41,42,73,74,687,138,134,135,149,287,288,289,303,304,305,306,50,51,52,53,54,55,56,87,253,152,153,295,297,300,301,302,319,320,322,323,324,359,1,76,77,78,79,80,81,82,83,84,85,86,88,352,353,356,366,165,271,272,273,274,275,276,277,278,401,141,143,144,145,146,147,148,259,261,292,293,150,29,151,215,216,217,218,219,220,221,222,223,224,225,154,155,191,194,270,407,408,410,525,117,123,124,334,335,337,339,342,350,437,438,666,668,669,671,672,467,280,310,311,312,315,317,318,383,384,386,499,520,563,653,661,692,693,90,89,91,92,93,118,119,128,129,130,131,132,506,665,231,227,228,229,230,232,233,234,235,236,237,43,44,45,46,47,48,49,65,66,67,68,226,269,405,5,6,7,8,9,10,115,136,137,246,256,189,173,174,175,176,177,178,179,181,182,183,184,185,186,187,188,391,20,13,14,15,16,17,19,21,22,25,331,333,166,281,282,283,284,285,286,307,308,309,343,344,346,673,239,241,242,243,244,456,582,670,674,675,679,680,681,683,684,686,689,690,2,3,4,59,61,422,424,426,588,140,395,396,472,480,139,608,652,659,676,682,685,688,466,516,655,656,677,678,691,911,910,904,923]
# city_d = [95, 94, 133, 195, 196, 197]  # shorter list for a quick test run
for i in city_d:
    data = downloadbaiduindex.GetIndex(city=str(i), keys=key, start='2022-01-01', end='2022-09-12')
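As written, GetIndex only prints each day's values and the loop above discards everything but the last DataFrame. If you want to keep what you crawl, here is a minimal sketch of an alternative loop (my own addition, not part of the original script) that tags each city's DataFrame with its city code and writes everything to one CSV:

frames = []
for i in city_d:
    df = downloadbaiduindex.GetIndex(city=str(i), keys=key, start='2022-01-01', end='2022-09-12')
    if not df.empty:
        df['city_code'] = i  # remember which city this slice came from
        frames.append(df)
if frames:
    pd.concat(frames).to_csv('baidu_index.csv', index_label='date', encoding='utf-8-sig')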
For the mapping from city codes to city names, see: https://blog.youkuaiyun.com/qq_38524532/article/details/126874267
A heads-up: if you crawl too much, your Baidu account gets blocked on the Baidu Index side, after which it cannot query any data at all.
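To lower the risk of that ban, one option is a randomized delay between city requests instead of a fixed one-second sleep. A minimal sketch (the 3-8 second range is my own guess, not a documented Baidu rate limit):

import random

for i in city_d:
    data = downloadbaiduindex.GetIndex(city=str(i), keys=key, start='2022-01-01', end='2022-09-12')
    time.sleep(random.uniform(3, 8))  # randomized pause between requests to look less bot-like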