模拟js加密参数,爬取某商城数据

1.查看网页,分析加密参数

查看请求的 form data 可以发现,其中的 _sig 参数为加密参数。经过请求测试,该参数为必要参数,每一个携带有用数据的 ajax 请求中都带有该参数。它必然是前端 JavaScript 代码依据某些参数计算后生成的,要想破解,必须找到其在 JavaScript 中的实现方式。

通过chrome浏览器调试js代码获取该参数生成方式,教程:

https://mp.youkuaiyun.com/postedit/90674760

2.查看 js 代码发现,_sig 参数是将几个参数拼接后采用 md5 的方式加密生成的。因为计算方式较为简单,这里就不采用 ExecJS 调用 js 代码的方式,而是采用 python 模拟 js 的实现方式

实现代码:

from hashlib import md5


class PingAnEncrypt(object):
    """Python stand-in for the site's JavaScript signing routine."""

    def __init__(self):
        """Nothing to initialise; the class only groups static helpers."""

    @staticmethod
    def index_encrypt(str):
        """Return the hexadecimal MD5 digest of ``str`` encoded as UTF-8.

        :param str: text to hash (the parameter name shadows the builtin; it
            is kept unchanged so keyword callers keep working)
        :return: the digest as a hexadecimal string
        """
        payload = str.encode('utf8')
        return md5(payload).hexdigest()


if __name__ == '__main__':
    # Demo call with a placeholder payload. The variable is not named ``str``:
    # assigning to ``str`` here would rebind the builtin for the whole module.
    sample_text = "XXXXXXXXXXXXX"
    PingAnEncrypt.index_encrypt(sample_text)

3.破解加密参数后,下一步就是构建模拟请求,逐级获取参数

后续商品子页面请求头中的cookie不是必要参数,为了页面整洁,可以不添加,部分代码:

class PingAnYao(object):
    """Crawl the category tree of the mall through its signed form-POST API.

    Each request carries a ``_sig`` field computed by
    ``PingAnEncrypt.index_encrypt`` over a string built from the other form
    fields plus a fixed suffix (see the ``base_sig_param`` values below).

    ``create_table`` and ``goods_html`` are called by this class but are not
    part of this excerpt.
    """

    def __init__(self):
        # One shared session for every request made by this crawler.
        self.session = requests.session()
        self.list_url = PINGAN_YAO_URL
        self.list_headers = PINGAN_YAO_LISTHEADERS
        self.goods_headers = PINGAN_YAO_GOODSHEADERS

    def list_html(self):
        """Request the front category tree (id 10001).

        :return: the decoded JSON response, or ``None`` when the HTTP status
            is not 200
        """
        base_sig_param = "_chl=android|WAP_mt=unicorn.getFrontCategory_sm=md5id=10001jk.pingan.com"
        sig_param = PingAnEncrypt.index_encrypt(base_sig_param)
        list_data = {
            "_mt": "unicorn.getFrontCategory",
            "id": 10001,
            "_chl": "android|WAP",
            "_sm": "md5",
            "_sig": sig_param,
        }
        response = self.session.post(url=self.list_url, data=list_data, headers=self.list_headers)
        response.encoding = response.apparent_encoding
        if response.status_code != 200:
            print("请求失败")
            return None
        return json.loads(response.text)

    def analysis_list_html(self, json_data):
        """Parse the front page JSON and crawl each top-level category.

        Children 2..10 of the first ``content`` entry are visited; the range
        is hard-coded (presumably site-specific - confirm before changing).
        Entries missing from ``json_data`` are skipped instead of raising.

        :param json_data: decoded JSON returned by :meth:`list_html`
        :return: None
        """
        # jsonpath.jsonpath returns False (not an empty list) when nothing matches.
        root_match = jsonpath.jsonpath(json_data, '$.content.0.id')
        if not root_match:
            print("未找到根分类id")
            return
        base_id = str(root_match[0])
        for num in range(2, 11):
            name_match = jsonpath.jsonpath(json_data, '$.content.0.children.{}.name'.format(num))
            id_match = jsonpath.jsonpath(json_data, '$.content.0.children.{}.id'.format(num))
            if not name_match or not id_match:
                continue
            classify_name = str(name_match[0])
            classify_id = str(id_match[0])
            print(classify_name)
            print(classify_id)
            self.classify_html(base_id, classify_name, classify_id)

    def classify_html(self, base_id, classify_name, classify_id):
        """Fetch one top-level category and process each of its leaf categories.

        For every leaf category a table is created via ``self.create_table``
        and its goods are fetched via ``self.goods_html``. A failure on one
        leaf is reported and does not stop the remaining ones.

        :param base_id: id of the root category (sent as ``rootId``)
        :param classify_name: name of the top-level category
        :param classify_id: id of the top-level category
        :return: None
        """
        base_sig_param = "_chl=android|WAP_mt=unicorn.getFirstLevelFrontCategory_sm=md5id={a}rootId={b}jk.pingan.com".format(a=classify_id, b=base_id)
        sig_param = PingAnEncrypt.index_encrypt(base_sig_param)
        classify = {
            "_mt": "unicorn.getFirstLevelFrontCategory",
            "rootId": base_id,
            "id": classify_id,
            "_chl": "android|WAP",
            "_sm": "md5",
            "_sig": sig_param,
        }
        response = self.session.post(url=self.list_url, data=classify, headers=self.list_headers)
        response.encoding = response.apparent_encoding
        if response.status_code != 200:
            print("请求失败")
            return
        json_data = json.loads(response.text)
        name_list, id_list, doc_name_list = self.analysis_classify_html(json_data, classify_name)
        for goods_name, goods_id, doc_name in zip(name_list, id_list, doc_name_list):
            # '/' is stripped because the name is used as a table name.
            doc_name = doc_name.replace('/', '')
            try:
                goods_name = str(goods_name)
                goods_id = int(goods_id)
                print(doc_name)
                self.create_table(doc_name)
                time.sleep(0.02)
                self.goods_html(goods_name, goods_id)
            except Exception as e:
                # Best effort per leaf category: report the failure and move on
                # instead of silently discarding it.
                print("处理分类失败: {} ({})".format(doc_name, e))

    def analysis_classify_html(self, json_data, classify_name):
        """Collect the third-level categories found in a category response.

        Scans ``content[0].children[i].children[j]`` for i, j in 0..99 and
        stops scanning a second-level node at its first missing child.

        :param json_data: decoded JSON containing the sub-category tree
        :param classify_name: top-level category name, used as the prefix of
            every generated table name
        :return: ``(name_list, id_list, doc_name_list)`` - parallel lists of
            third-level names, third-level ids and table names of the form
            ``<top>_<second>_<third>``
        """
        name_list = []
        id_list = []
        doc_name_list = []
        for i in range(0, 100):
            # The second-level name does not depend on j, so query it once.
            second_level_name = jsonpath.jsonpath(json_data, '$.content.0.children.{}.name'.format(i))
            for j in range(0, 100):
                children_name = jsonpath.jsonpath(json_data, '$.content.0.children.{}.children.{}.name'.format(i, j))
                children_id = jsonpath.jsonpath(json_data, '$.content.0.children.{}.children.{}.id'.format(i, j))
                if children_name is False:
                    # No child at index j, so there is none after it either.
                    break
                name_list += children_name
                id_list += children_id
                doc_name = "{}_{}_{}".format(classify_name, str(second_level_name[0]), str(children_name[0]))
                doc_name_list.append(doc_name)
        return name_list, id_list, doc_name_list

4.获取到数据后,下一步就是存储到数据库,因为产品类别很多,而且又有分类存储的需求,所以选择自动创建数据表,自动存储的方式,代码:

try:
    import pymysql
except ImportError:
    # Only a missing module is tolerated here; the database classes below
    # still need ``pymysql`` when they open a connection.
    print('module "pymysql" is not available in your current environment')
import copy
from PingAn.config import *


class CreateTable():
    """Open a MySQL connection and create the goods table ``tb_name``.

    An instance is single-use: :meth:`create_pingan_table` closes the
    connection when it finishes.

    :param db_name      :       database name
    :param tb_name      :       name of the table to create
    :param mode         :       stored on the instance; not used by this class
    :param host         :       server address
    :param port         :       database port
    :param username     :       user name
    :param password     :       password
    :param charset      :       connection character set
    """
    def __init__(self, db_name, tb_name, mode=None, host=HOST, port=PORT, username=USER_NAME, password=PASSWORD,charset=CHARSET, **kwargs):
        # Snapshot of the constructor arguments as seen by locals().
        self.db_config = copy.deepcopy(locals())
        self.mode = mode
        self.db_name = db_name
        self.tb_name = tb_name
        self.host = host
        self.port = port
        self.username = username
        self.password = password
        self.charset = charset
        self.conn = self.get_conn()
        self.cursor = self.conn.cursor()

    def get_conn(self):
        """
        Connect to MySQL through pymysql using the stored settings.
        :return: the open connection
        """
        conn = pymysql.connect(host=self.host,port=self.port,user=self.username,password=self.password,charset=self.charset,database=self.db_name)
        return conn

    def close(self):
        """Close the cursor and the connection; repeated calls are no-ops."""
        if getattr(self, "_closed", False):
            return
        self._closed = True
        try:
            self.cursor.close()
        finally:
            self.conn.close()

    def create_pingan_table(self):
        '''
        Create the table that stores the goods information.

        The connection is closed afterwards whether or not the statement
        succeeded.
        :return: the result of ``cursor.fetchall()`` on success, ``None`` on failure
        '''
        # Table names cannot be bound as query parameters, so the identifier
        # is backtick-quoted with embedded backticks doubled.
        table = "`{}`".format(str(self.tb_name).replace("`", "``"))
        sql_1 = "create table {a}(id int auto_increment primary key,goods_name varchar(160) not null,sales_volume int(30) not null,price int(30),product_id int(30),image_url varchar(150),goods_url varchar(150));".format(a=table)
        try:
            self.cursor.execute(sql_1)
            self.conn.commit()
            print("创建成功")
            return self.cursor.fetchall()
        except Exception as exc:
            self.conn.rollback()
            print("创建失败: {}".format(exc))
            return None
        finally:
            self.close()

class DatabaseSql():
    """Open a database connection and insert goods rows into ``tb_name``.

    Only MySQL (``mode=1``) is implemented. An instance is single-use:
    :meth:`set_pingan_data` closes the connection when it finishes.

    :param db_name      :       database name
    :param tb_name      :       table that receives the rows
    :param mode         :       connection kind, 1 means MySQL
    :param host         :       server address
    :param port         :       database port
    :param username     :       user name
    :param password     :       password
    :param charset      :       connection character set
    """
    def __init__(self,db_name,tb_name,mode=1,host=HOST,port=PORT,username=USER_NAME,password=PASSWORD,charset=CHARSET,**kwargs):
        # Snapshot of the constructor arguments as seen by locals().
        self.db_config = copy.deepcopy(locals())
        self.mode = mode
        self.db_name = db_name
        self.tb_name = tb_name
        self.host = host
        self.port = port
        self.username = username
        self.password = password
        self.charset = charset
        self.conn = self.get_conn()
        self.cursor = self.conn.cursor()

    def get_conn(self):
        """
        Connect to the database selected by ``self.mode``.
        :return: the open connection
        :raises NotImplementedError: when ``mode`` is not 1; previously this
            returned None and ``__init__`` failed with an AttributeError
        """
        if self.mode != 1:
            raise NotImplementedError("only mode=1 (MySQL) is supported, got {!r}".format(self.mode))
        return pymysql.connect(host=self.host,port=self.port,user=self.username,password=self.password,charset=self.charset,database=self.db_name)

    def set_pingan_data(self,goods_name,sales_volume,price,product_id,image_url,goods_url):
        '''
        Insert one goods row into the table.

        Every value is converted with ``str`` and sent as a bound parameter,
        so quotes inside scraped text cannot break or alter the statement.
        The connection is closed afterwards whether or not the insert
        succeeded.
        :return: the result of ``cursor.fetchall()`` on success, ``None`` on failure
        '''
        row = tuple(str(value) for value in (goods_name, sales_volume, price, product_id, image_url, goods_url))
        # Table names cannot be bound as parameters: backtick-quote the
        # identifier, and double any '%' because the query text goes through
        # %-formatting when arguments are supplied.
        table = "`{}`".format(str(self.tb_name).replace("`", "``")).replace("%", "%%")
        sql_1 = "INSERT INTO {} VALUES(null,%s,%s,%s,%s,%s,%s);".format(table)
        try:
            self.cursor.execute(sql_1, row)
            self.conn.commit()
            print("添加成功===============>>>>>>>")
            return self.cursor.fetchall()
        except Exception as exc:
            self.conn.rollback()
            print("添加失败: {}".format(exc))
            return None
        finally:
            self.close()

    def close(self):
        """Close the cursor and the connection; repeated calls are no-ops."""
        if getattr(self, "_closed", False):
            return
        self._closed = True
        try:
            self.cursor.close()
        finally:
            self.conn.close()
        

5.至此,整个商城的数据爬取工作就告一段落了,接下来的主要工作是对数据进行清洗和分析.

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值