1.查看网页,分析加密参数
查看其 form data 可以发现,其中的 _sig 参数为加密参数。经过请求测试,该参数为必要参数,在每一个携带有用数据的 ajax 请求中都有该参数,它必然是前端 JavaScript 代码依据某些参数计算后生成的。要想破解,必须找到其在 JavaScript 中的实现方式。
通过chrome浏览器调试js代码获取该参数生成方式,教程:
https://mp.youkuaiyun.com/postedit/90674760
2.查看js代码发现,_sig参数是由几个参数拼接后经 MD5 哈希生成的。因为计算方式较为简单,就不采用 ExecJS 调用js代码的方式,而是用 Python 模拟js的实现方式。
实现代码:
from hashlib import md5
class PingAnEncrypt(object):
    """Holder for the MD5 helper that produces the ``_sig`` request value."""

    def __init__(self):
        """Nothing to initialise; the only helper is a static method."""
        pass

    @staticmethod
    def index_encrypt(str):
        """Return the hexadecimal MD5 digest of ``str`` encoded as UTF-8.

        The parameter name shadows the ``str`` builtin; it is left as is so
        that callers passing it by keyword keep working.

        :param str: text to hash
        :return: 32-character lowercase hex digest
        """
        return md5(str.encode('utf8')).hexdigest()
if __name__ == '__main__':
    # Demo: hash a placeholder string and show the resulting signature.
    # The sample is deliberately NOT bound to the name ``str``: rebinding the
    # builtin at module level would break every later ``str(...)`` call made
    # by code living in the same script.
    sample_text = "XXXXXXXXXXXXX"
    print(PingAnEncrypt.index_encrypt(sample_text))
3.破解加密参数后,下一步就是构建模拟请求,逐级获取参数
后续商品子页面请求头中的cookie不是必要参数,为了页面整洁,可以不添加,部分代码:
class PingAnYao(object):
    """Walk the category tree exposed by the jk.pingan.com form API.

    Only part of the class is shown in this listing: ``create_table`` and
    ``goods_html`` are called by ``classify_html`` but are defined elsewhere.

    The JSON payloads are expected to look like
    ``{"content": [{"id": ..., "children": [{"name": ..., "id": ...,
    "children": [{"name": ..., "id": ...}, ...]}, ...]}]}`` -- this is the
    layout the original jsonpath expressions (``$.content.0.children.N...``)
    addressed.
    """

    # Window of root children visited by ``analysis_list_html``; equivalent
    # to the indexes 2..10 that were previously hard-coded as range(2, 11).
    _CLASSIFY_SLICE = slice(2, 11)

    def __init__(self):
        self.session = requests.session()
        self.list_url = PINGAN_YAO_URL
        self.list_headers = PINGAN_YAO_LISTHEADERS
        self.goods_headers = PINGAN_YAO_GOODSHEADERS

    def _post_form(self, form_data):
        """POST ``form_data`` to ``self.list_url`` and return the decoded JSON.

        :param form_data: dict sent as the form body
        :return: parsed JSON on HTTP 200, otherwise ``None`` (after printing
                 the failure message)
        """
        response = self.session.post(url=self.list_url, data=form_data, headers=self.list_headers)
        if response.status_code != 200:
            print("请求失败")
            return None
        response.encoding = response.apparent_encoding
        return json.loads(response.text)

    def list_html(self):
        """Request the front category tree (``unicorn.getFrontCategory``).

        :return: parsed JSON payload, or ``None`` when the request fails
        """
        # Text that gets MD5-signed: the request parameters concatenated as
        # key=value followed by the literal "jk.pingan.com".
        base_sig_param = "_chl=android|WAP_mt=unicorn.getFrontCategory_sm=md5id=10001jk.pingan.com"
        sig_param = PingAnEncrypt.index_encrypt(base_sig_param)
        list_data = {
            "_mt": "unicorn.getFrontCategory",
            "id": 10001,
            "_chl": "android|WAP",
            "_sm": "md5",
            "_sig": "{}".format(sig_param),
        }
        return self._post_form(list_data)

    def analysis_list_html(self, json_data):
        """Parse the front page payload and crawl each selected category.

        Reads the root id from ``content[0]["id"]`` and, for every entry in
        the configured window of ``content[0]["children"]``, prints its name
        and id and calls ``classify_html``. Entries without a ``name`` or
        ``id`` are skipped, and a payload without the expected structure
        (including ``None`` from a failed ``list_html``) is reported instead
        of raising.

        :param json_data: payload returned by ``list_html``
        :return: None
        """
        try:
            root = json_data["content"][0]
            base_id = str(root["id"])
            selected = root["children"][self._CLASSIFY_SLICE]
        except (KeyError, IndexError, TypeError):
            print("首页数据解析失败")
            return
        for child in selected:
            if not isinstance(child, dict) or "name" not in child or "id" not in child:
                continue
            classify_name = str(child["name"])
            classify_id = str(child["id"])
            print(classify_name)
            print(classify_id)
            self.classify_html(base_id, classify_name, classify_id)

    def classify_html(self, base_id, classify_name, classify_id):
        """Fetch one category's sub-tree and process every leaf entry.

        For each leaf returned by ``analysis_classify_html`` this creates a
        table named after it (``create_table``) and then calls
        ``goods_html``. A failure on one leaf is printed and does not stop the
        remaining leaves.

        :param base_id: id sent as ``rootId``
        :param classify_name: category name, used as table-name prefix
        :param classify_id: id sent as ``id``
        :return: None
        """
        base_sig_param = "_chl=android|WAP_mt=unicorn.getFirstLevelFrontCategory_sm=md5id={a}rootId={b}jk.pingan.com".format(a=classify_id, b=base_id)
        sig_param = PingAnEncrypt.index_encrypt(base_sig_param)
        classify = {
            "_mt": "unicorn.getFirstLevelFrontCategory",
            "rootId": base_id,
            "id": classify_id,
            "_chl": "android|WAP",
            "_sm": "md5",
            "_sig": "{}".format(sig_param),
        }
        json_data = self._post_form(classify)
        if json_data is None:
            return
        name_list, id_list, doc_name_list = self.analysis_classify_html(json_data, classify_name)
        for goods_name, goods_id, doc_name in zip(name_list, id_list, doc_name_list):
            try:
                # '/' is stripped from the generated table name.
                doc_name = doc_name.replace('/', '')
                goods_name = str(goods_name)
                goods_id = int(goods_id)
                print(doc_name)
                self.create_table(doc_name)
                time.sleep(0.02)
                self.goods_html(goods_name, goods_id)
            except Exception as e:
                # Best effort: report the failing entry and keep going,
                # instead of hiding the error completely.
                print("处理 {} 失败: {}".format(doc_name, e))

    def analysis_classify_html(self, json_data, classify_name):
        """Collect name, id and table name for every third-level entry.

        Walks ``content[0]["children"][i]["children"][j]`` directly instead
        of probing fixed index ranges, so there is no 100-entry cap. Every
        group is treated the same way: a child lacking ``name`` or ``id`` is
        skipped (keeping the three lists aligned), and a group without a
        ``name`` is skipped because no table name can be built for it.

        :param json_data: payload containing the sub-category tree
        :param classify_name: first-level name, used as table-name prefix
        :return: ``(name_list, id_list, doc_name_list)`` where each table
                 name is ``"<classify>_<second level>_<child>"``
        """
        name_list = []
        id_list = []
        doc_name_list = []
        try:
            second_levels = json_data["content"][0]["children"]
        except (KeyError, IndexError, TypeError):
            return name_list, id_list, doc_name_list
        if not isinstance(second_levels, list):
            return name_list, id_list, doc_name_list
        for second in second_levels:
            if not isinstance(second, dict) or "name" not in second:
                continue
            children = second.get("children")
            if not isinstance(children, list):
                continue
            for child in children:
                if not isinstance(child, dict) or "name" not in child or "id" not in child:
                    continue
                name_list.append(child["name"])
                id_list.append(child["id"])
                doc_name_list.append("{}_{}_{}".format(classify_name, str(second["name"]), str(child["name"])))
        return name_list, id_list, doc_name_list
4.获取到数据后,下一步就是存储到数据库,因为产品类别很多,而且又有分类存储的需求,所以选择自动创建数据表,自动存储的方式,代码:
try:
import pymysql
except:
print('module "pymysql" is not avalable for your recent system corcustance')
pass
import copy
from PingAn.config import *
class CreateTable():
    """Open a MySQL connection and create the goods table on request.

    :param db_name: database name
    :param tb_name: name of the table to create
    :param mode: stored on the instance; ``get_conn`` always connects to
                 MySQL regardless of its value
    :param host: server address
    :param port: database port
    :param username: user name
    :param password: password
    :param charset: connection character set
    """

    def __init__(self, db_name, tb_name, mode=None, host=HOST, port=PORT, username=USER_NAME, password=PASSWORD, charset=CHARSET, **kwargs):
        # Snapshot of the constructor arguments. ``self`` is intentionally
        # left out: deep-copying a half-initialised instance is pointless.
        self.db_config = copy.deepcopy({
            "db_name": db_name,
            "tb_name": tb_name,
            "mode": mode,
            "host": host,
            "port": port,
            "username": username,
            "password": password,
            "charset": charset,
            "kwargs": kwargs,
        })
        self.mode = mode
        self.db_name = db_name
        self.tb_name = tb_name
        self.host = host
        self.port = port
        self.username = username
        self.password = password
        self.charset = charset
        self.conn = self.get_conn()
        self.cursor = self.conn.cursor()

    def get_conn(self):
        """Connect to MySQL with the stored settings.

        :return: an open ``pymysql`` connection
        """
        conn = pymysql.connect(host=self.host, port=self.port, user=self.username, password=self.password, charset=self.charset, database=self.db_name)
        return conn

    def close(self):
        """Close cursor and connection; calling it again is a no-op."""
        # pymysql raises "Already closed" on a second Connection.close().
        if self.conn.open:
            self.cursor.close()
            self.conn.close()

    def create_pingan_table(self):
        """Create the table that stores the scraped goods rows.

        The connection is closed exactly once when the method finishes,
        whether the statement succeeded or not. Previously a successful
        create closed the connection twice, which raised inside the ``try``
        and then failed again on rollback.

        :return: ``cursor.fetchall()`` on success, ``None`` when MySQL
                 rejects the statement (for example the table already exists)
        """
        # NOTE: identifiers cannot be bound as query parameters, so
        # ``tb_name`` is interpolated as is and must be a valid, trusted
        # table name.
        sql_1 = "create table {a}(id int auto_increment primary key,goods_name varchar(160) not null,sales_volume int(30) not null,price int(30),product_id int(30),image_url varchar(150),goods_url varchar(150));".format(a=self.tb_name)
        try:
            self.cursor.execute(sql_1)
            self.conn.commit()
            print("创建成功")
            return self.cursor.fetchall()
        except pymysql.MySQLError as exc:
            self.conn.rollback()
            print("创建失败: {}".format(exc))
            return None
        finally:
            self.close()
class DatabaseSql():
    """Open a database connection and insert scraped goods rows.

    :param db_name: database name
    :param tb_name: table that receives the rows
    :param mode: connection type; only ``1`` (MySQL) is implemented
    :param host: server address
    :param port: database port
    :param username: user name
    :param password: password
    :param charset: connection character set
    """

    def __init__(self, db_name, tb_name, mode=1, host=HOST, port=PORT, username=USER_NAME, password=PASSWORD, charset=CHARSET, **kwargs):
        # Snapshot of the constructor arguments. ``self`` is intentionally
        # left out: deep-copying a half-initialised instance is pointless.
        self.db_config = copy.deepcopy({
            "db_name": db_name,
            "tb_name": tb_name,
            "mode": mode,
            "host": host,
            "port": port,
            "username": username,
            "password": password,
            "charset": charset,
            "kwargs": kwargs,
        })
        self.mode = mode
        self.db_name = db_name
        self.tb_name = tb_name
        self.host = host
        self.port = port
        self.username = username
        self.password = password
        self.charset = charset
        self.conn = self.get_conn()
        self.cursor = self.conn.cursor()

    def get_conn(self):
        """Connect to the database selected by ``self.mode``.

        :return: an open ``pymysql`` connection when ``mode == 1``
        :raises NotImplementedError: for any other mode. Previously this
            returned ``None`` and ``__init__`` then failed with an opaque
            ``AttributeError``.
        """
        if self.mode != 1:
            raise NotImplementedError("only mode=1 (MySQL) is supported, got mode={!r}".format(self.mode))
        return pymysql.connect(host=self.host, port=self.port, user=self.username, password=self.password, charset=self.charset, database=self.db_name)

    def set_pingan_data(self, goods_name, sales_volume, price, product_id, image_url, goods_url):
        """Insert one goods row, then close the connection.

        Values are passed as query parameters so that quotes or other special
        characters in scraped text cannot break, or inject into, the
        statement. As before, the instance is single-use: the connection is
        closed after the insert. It is now also closed when the insert fails,
        instead of being leaked.

        :return: ``cursor.fetchall()`` on success, ``None`` when MySQL
                 rejects the statement
        """
        # Every value is sent as text, as in the original statement.
        row = (
            str(goods_name),
            str(sales_volume),
            str(price),
            str(product_id),
            str(image_url),
            str(goods_url),
        )
        # NOTE: identifiers cannot be bound as parameters, so ``tb_name`` is
        # interpolated and must be a trusted table name. '%' is doubled
        # because pymysql applies %-formatting to the whole query text.
        table = self.tb_name.replace('%', '%%')
        sql_1 = "INSERT INTO {} VALUES(null,%s,%s,%s,%s,%s,%s);".format(table)
        try:
            self.cursor.execute(sql_1, row)
            self.conn.commit()
            print("添加成功===============>>>>>>>")
            return self.cursor.fetchall()
        except pymysql.MySQLError as exc:
            self.conn.rollback()
            print("添加失败: {}".format(exc))
            return None
        finally:
            self.close()

    def close(self):
        """Close cursor and connection; calling it again is a no-op."""
        # pymysql raises "Already closed" on a second Connection.close().
        if self.conn.open:
            self.cursor.close()
            self.conn.close()
5.至此,整个商城的数据爬取工作就告一段落了,接下来的主要工作是对数据进行清洗和分析.