利用python从apk的操作码序列中生成2-gram邻接矩阵
说明
input:对APK文件进行处理后得到的、以16进制数表示的操作码序列
该16进制表示是利用DalvikOpcodes.txt中的操作码映射字典得到的。这些操作码包含了Dalvik虚拟机中定义的200多种操作码,如下:
对操作码序列生成2-gram共生矩阵(即统计相邻两个操作码成对出现次数的矩阵,不了解的可以自行搜索),完整代码如下:
import re
import os
import operator
import pandas as ps
import numpy as np
from collections import Counter
# Input directory (one opcode-sequence file per sample) and output directory
# for the generated CSV matrices.
path = r"D:\test\opseq"
# Fixed: the original r"D:test\out" lacked the backslash after the drive
# letter, which Windows resolves relative to the current directory on D:
# instead of as the absolute folder next to the input directory.
out_path = r"D:\test\out"

# Names of the regular files directly inside `path`; sub-directories are skipped.
files = [
    name
    for name in os.listdir(path)
    if os.path.isfile(os.path.join(path, name))
]
print(files)

# Opcode table: first whitespace-separated token of each line -> second token.
# Presumably "mnemonic hex-code" pairs -- confirm against DalvikOpcodes.txt.
dalvik_opcodes = {}
with open("DalvikOpcodes.txt") as fop:
    for linee in fop:
        # A blank (e.g. trailing) line would make the two-value unpack raise.
        if not linee.strip():
            continue
        (key, val) = linee.split()
        dalvik_opcodes[key] = val
# Two hex digits form one opcode token; compiled once instead of once per file.
_HEX_BYTE = re.compile(r'[0-9a-fA-F]{2}')


def _read_opcode_sequence(file_path):
    """Return every two-hex-digit token found in *file_path*, in file order."""
    with open(file_path, 'r') as f:
        return [token for line in f for token in _HEX_BYTE.findall(line)]


def _cooccurrence_matrix(opseq, codes):
    """Count adjacent token pairs of *opseq* into a square float array.

    Cell (i, j) holds how many times ``codes[i]`` is immediately followed by
    ``codes[j]`` in *opseq*. Pairs involving tokens absent from *codes* are
    ignored.
    """
    # Tuple keys instead of concatenated strings: same counts for two-character
    # codes, but cannot collide if a table entry has a different length.
    pair_counts = Counter(zip(opseq, opseq[1:]))
    size = len(codes)
    counts = [[pair_counts[(a, b)] for b in codes] for a in codes]
    # reshape keeps the (0, 0) shape when the opcode table is empty.
    return np.array(counts, dtype=float).reshape(size, size)


def _row_normalize(matrix):
    """Divide each row by its sum; rows that sum to zero stay all-zero."""
    row_sums = matrix.sum(axis=1, keepdims=True)
    normalized = np.zeros_like(matrix)
    np.divide(matrix, row_sums, out=normalized, where=row_sums != 0)
    return normalized


def _save_matrix(matrix, labels, csv_path):
    """Write *matrix* to *csv_path* with *labels* on both axes, 4 decimals."""
    # Labels go on the DataFrame itself: to_csv's `index` argument is a
    # boolean flag, so passing a list there only produced numeric row labels.
    frame = ps.DataFrame(matrix, index=labels, columns=labels)
    frame.to_csv(csv_path, float_format='%.4f')


def main():
    """Build raw and row-normalized 2-gram matrices for every input file.

    For each name in the module-level ``files`` list, the file is read from
    ``path``, its hex opcode tokens are extracted, and two CSV files are
    written into ``out_path``:

    * ``<stem>_adjacency-matrix.csv``      -- raw 2-gram counts
    * ``<stem>_norm_adjacency-matrix.csv`` -- counts divided by the row sum

    Rows and columns follow the order of ``dalvik_opcodes`` and are labelled
    with its keys; its values are the codes matched against the tokens. The
    matrix size is taken from the table instead of a hard-coded 218.
    Tokens are collected across line breaks, so a 2-gram may span two lines.
    """
    labels = list(dalvik_opcodes.keys())
    codes = list(dalvik_opcodes.values())

    for opseq_hash in files:
        file_name = os.path.splitext(opseq_hash)[0]
        opseq = _read_opcode_sequence(os.path.join(path, opseq_hash))

        matrix = _cooccurrence_matrix(opseq, codes)
        _save_matrix(
            matrix,
            labels,
            os.path.join(out_path, file_name + "_adjacency-matrix.csv"),
        )
        _save_matrix(
            _row_normalize(matrix),
            labels,
            os.path.join(out_path, file_name + "_norm_adjacency-matrix.csv"),
        )

    # NOTE(review): the original indentation was lost, so it is unclear whether
    # this was printed once per file or once at the end; printed once here.
    print("Done")
# Run the batch conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()