突然想起,这个脚本完善之后已经过去几个月了,却一直忘了发出来。虽然没什么人看,但至少要有始有终:这一版在之前发过的两个版本的基础上优化了性能。
代码只有三四百行,看得懂看不懂就先这样吧……
from __future__ import division
import re,datetime
class PatternMerge:
    """Merge API records whose paths differ only in digit-bearing segments.

    A "digit segment" is a separator (``/``, ``|`` or ``;``) followed by a run
    of characters that contains at least one digit and none of ``/ , " ;``.
    Records are grouped by the shape of their path; groups that are large
    enough are collapsed into a single pattern such as ``/api/{id1}``.

    Constructor arguments:
        number (int): for records with several digit segments, only the last
            ``number`` segments are considered for merging (intended minimum
            is 2).
        size (int): a single-segment group is merged only when it holds more
            than ``size`` records.
        num (int): a multi-segment group is merged only when it holds more
            than ``num`` records.
        probability (float): a segment is judged to be a real parameter when
            ``distinct values / total records > probability``; expected range
            is [0, 1].  A value greater than 1 can never be exceeded by a
            ratio, so it keeps its previous meaning of an absolute threshold
            on the number of distinct values.
        on_merge_func (callable or None): called as
            ``on_merge_func(pattern, records)`` for every merged pattern
            before it is added to the result.

    Note: the grouping dictionaries and the result list live on the instance
    and are never cleared, so results accumulate across ``merge()`` calls.
    """

    # Regex source of one digit segment; also the text getRef()/getRe() emit.
    _SEGMENT_SOURCE = r'[/|;][^/,";]*\d[^/,";]*'
    _SEGMENT = re.compile(_SEGMENT_SOURCE)
    # Capturing variant: re.split() then returns plain text at even indexes
    # and the digit segments themselves at odd indexes.
    _SEGMENT_SPLIT = re.compile('(' + _SEGMENT_SOURCE + ')')
    # A "{idN}" placeholder (plus any trailing text of the same segment).
    _PLACEHOLDER = re.compile(r'[/|;]{id\d+}[^/,";]*')
    # The backslash must stay first so that the backslashes inserted for the
    # other characters are not escaped a second time.
    _REGEXP_SPECIAL_CHARS = (u'\\', u'^', u'$', u'*', u'+', u'?', u'.', u'|',
                             u'(', u')', u'[', u']')

    def __init__(self, number, size, num, probability, on_merge_func):
        self.number = number
        self.size = size
        self.num = num
        self.probability = probability
        # pattern key -> records with exactly one digit segment
        self.patternList = {}
        # pattern key -> records with two or more digit segments
        self.patternsDict = {}
        # accumulated output of merge()
        self.patternData = []
        self.on_merge_func = on_merge_func

    def sub(self, data):
        """Replace every digit segment with "{id1}", "{id2}", ... in order.

        The separator character that starts each segment is kept.
        """
        pieces = self._SEGMENT_SPLIT.split(data)
        for ordinal, index in enumerate(range(1, len(pieces), 2), 1):
            pieces[index] = pieces[index][0] + "{id" + str(ordinal) + "}"
        return "".join(pieces)

    def escape_regexp_char(self, value):
        """Backslash-escape the regex special characters found in value.

        Braces are not escaped; getRe() looks for "{idN}" placeholders in the
        escaped text.
        """
        escape_char = u'\\'
        for char in self._REGEXP_SPECIAL_CHARS:
            value = value.replace(char, escape_char + char)
        return value

    def getRef(self, data):
        """Return an anchored regex for data with each digit segment generalized.

        Every digit segment of the escaped input is replaced by the regex
        source that matches any digit segment.
        """
        escaped = self.escape_regexp_char(data)
        # A function replacement inserts the text verbatim (no template
        # escape processing).
        generalized = self._SEGMENT.sub(lambda _m: self._SEGMENT_SOURCE, escaped)
        return "^" + generalized + "$"

    def getRe(self, data):
        """Return an anchored regex for a pattern containing "{idN}" placeholders.

        Example: "/api/{id1},GET,40" becomes
        '^/api[/|;][^/,";]*\\d[^/,";]*,GET,40$' (shown with one backslash
        escaped for the docstring).
        """
        escaped = self.escape_regexp_char(data)
        generalized = self._PLACEHOLDER.sub(lambda _m: self._SEGMENT_SOURCE, escaped)
        return "^" + generalized + "$"

    def getDataString(self, data, num):
        """Keep only the num-th digit segment; replace all others with "/{id}".

        Used to count how many distinct values one segment takes.  Example:
        getDataString('/api/57/login/3/123', 2) -> '/api/{id}/login/3/{id}'.
        Positions are 1-based.
        """
        pieces = self._SEGMENT_SPLIT.split(data)
        for ordinal, index in enumerate(range(1, len(pieces), 2), 1):
            if ordinal != num:
                pieces[index] = "/{id}"
        return "".join(pieces)

    def getPattern(self, data, idList):
        """Replace the digit segments whose 1-based position is in idList.

        Replaced segments are numbered "{id1}", "{id2}", ... in order of
        appearance; all other segments are kept verbatim.  Example:
        getPattern('/api/57/login/3/123', [2, 3]) -> '/api/57/login/{id1}/{id2}'.
        """
        pieces = self._SEGMENT_SPLIT.split(data)
        placeholder_no = 1
        for ordinal, index in enumerate(range(1, len(pieces), 2), 1):
            if ordinal in idList:
                pieces[index] = pieces[index][0] + "{id" + str(placeholder_no) + "}"
                placeholder_no += 1
        return "".join(pieces)

    def _mark_unmerged(self, records):
        """Tag records as type 2 (not merged) and add them to the result."""
        for record in records:
            record['type'] = 2
            self.patternData.append(record)

    def _build_pattern(self, key, records):
        """Turn a "path,host,port,method" key into a type-1 pattern record."""
        # Split from the right so a comma inside the path cannot shift the
        # host/port/method fields.
        path, host, port, method = key.rsplit(",", 3)
        pattern = {'path': path, 'host': host, 'port': port,
                   'method': method, 'type': 1}
        if self.on_merge_func is not None:
            self.on_merge_func(pattern, records)
        return pattern

    def _is_real_parameter(self, distinct, total):
        """Decide whether a segment with `distinct` values over `total` records varies enough."""
        if self.probability > 1:
            # Legacy behaviour for thresholds outside [0, 1]: absolute count.
            return distinct > self.probability
        return float(distinct) / total > self.probability

    def _real_parameter_positions(self, pattern_key, records):
        """Return the 1-based positions of the trailing segments judged to be parameters."""
        segment_count = len(self._SEGMENT.findall(pattern_key))
        # Only the last `number` segments (or all, if there are fewer) are examined.
        examined = min(self.number, segment_count)
        positions = []
        for position in range(segment_count - examined + 1, segment_count + 1):
            distinct = set(self.getDataString(record['path'], position)
                           for record in records)
            if self._is_real_parameter(len(distinct), len(records)):
                positions.append(position)
        return positions

    def data_group(self, content):
        """Group records by pattern according to how many digit segments the path has.

        Records with two or more digit segments go to patternsDict, records
        with exactly one go to patternList, and records with none are tagged
        type 2 and appended to patternData directly.  The grouping key is
        built from path, host, port and method only, so records that differ
        in other fields (such as match) can share a group.
        """
        for record in content:
            segments = self._SEGMENT.findall(record['path'])
            line = (record['path'] + "," + record['host'] + ","
                    + str(record['port']) + "," + record['method'])
            if len(segments) >= 2:
                self.patternsDict.setdefault(self.sub(line), []).append(record)
            elif len(segments) == 1:
                self.patternList.setdefault(self.sub(line), []).append(record)
            else:
                record['type'] = 2
                self.patternData.append(record)

    def data_one_merge(self):
        """Process the single-segment groups.

        A group with more than `size` records is replaced by one merged
        pattern; smaller groups are passed through unmerged.
        """
        for key in self.patternList:
            records = self.patternList[key]
            if len(records) > self.size:
                self.patternData.append(self._build_pattern(key, records))
            else:
                self._mark_unmerged(records)

    def merge(self, content):
        """Merge API records and return the accumulated result list.

        content is a list of dicts with at least the keys 'path', 'host',
        'port' and 'method' (the original documentation also lists 'match').
        Input dicts that are not merged get a 'type' key set to 2; merged
        patterns are new dicts with 'type' 1 and a string 'port'.
        """
        self.data_group(content)
        if self.patternList:
            self.data_one_merge()

        # Groups whose records contain two or more digit segments.
        for pattern_key in self.patternsDict:
            records = self.patternsDict[pattern_key]
            if len(records) <= self.num:
                # Too few records to judge which segments are parameters.
                self._mark_unmerged(records)
                continue

            id_positions = self._real_parameter_positions(pattern_key, records)

            # pattern string -> original records matching that pattern
            grouped = {}
            for record in records:
                pattern = self.getPattern(
                    record['path'] + "," + record['host'] + ","
                    + str(record['port']) + "," + record['method'],
                    id_positions)
                if self._PLACEHOLDER.search(pattern):
                    grouped.setdefault(pattern, []).append(record)
                else:
                    # No segment was judged to be a parameter for this record.
                    record['type'] = 2
                    self.patternData.append(record)

            for pattern in grouped:
                self.patternData.append(self._build_pattern(pattern, grouped[pattern]))
        return self.patternData
def test_one():
    """API merge unit test: merging an empty list works and returns an empty list."""
    merger = PatternMerge(2, 3, 2, 0.6, None)
    assert merger.merge([]) == []
    print(True)
def test_two():
    """API merge unit test: single-parameter records.

    The input mixes records that can be merged (100 paths "/aa/<n>") with two
    that cannot; the on_merge callback must run and store the largest match
    on the merged pattern.
    """
    merger = PatternMerge(2, 3, 2, 0.6, on_merge)
    records = [
        {"path": "/aa/" + str(i), "host": "10.10.65.21", "port": 80,
         "match": i, "method": "GET"}
        for i in range(0, 100)
    ]
    records.append({"path": "/4/login;jsessionid=1", "host": "10.10.65.21", "port": 80,
                    "match": 99.2911744266852, "method": "GET"})
    records.append({"path": "/1/login;jsessionid=3", "host": "10.10.65.21", "port": 80,
                    "match": 99.2911744266852, "method": "GET"})
    merged = merger.merge(records)
    # Paths are text strings, so compare against str literals (a bytes
    # literal never equals a str on Python 3).
    assert merged[0]['path'] == '/aa/{id1}'
    assert merged[0]['match'] == 99
    assert merged[1]['path'] == '/4/login;jsessionid=1'
    assert merged[2]['path'] == '/1/login;jsessionid=3'
    print(True)
def test_three():
    """API merge unit test: multi-parameter records, only the last two segments are merged."""
    merger = PatternMerge(2, 3, 2, 0.6, on_merge)
    paths = (
        "/api/1/2/test/3",
        "/api/2/1/test/6",
        "/api/3/21/test/7",
        "/api/4/2d/test/31",
        "/api/1/12/test/35",
        "/api/1/12/test/35/fsdf",
    )
    records = []
    for path in paths:
        records.append({"path": path, "host": "10.10.65.21", "port": 80,
                        "match": 99.2911744266852, "method": "GET"})
    merged = merger.merge(records)
    expected_heads = ('/api/1/{id1}/test/{id2}',
                      '/api/2/{id1}/test/{id2}',
                      '/api/3/{id1}/test/{id2}',
                      '/api/4/{id1}/test/{id2}')
    for position, expected in enumerate(expected_heads):
        assert merged[position]['path'] == expected
    print(True)
def test_four():
    """API merge unit test: paths containing a semicolon segment can be merged."""
    merger = PatternMerge(2, 3, 2, 0.6, None)
    records = [
        {"path": "/4/login;jsessionid=1", "host": "10.10.65.21", "port": 80,
         "match": 99.2911744266852, "method": "GET"},
        {"path": "/1/login;jsessionid=3", "host": "10.10.65.21", "port": 80,
         "match": 99.2911744266852, "method": "GET"},
        {"path": "/2/login;jsessionid=4", "host": "10.10.65.21", "port": 80,
         "match": 99.2911744266852, "method": "GET"},
        {"path": "/3/login;jsessionid=5", "host": "10.10.65.21", "port": 80,
         "match": 99.2911744266852, "method": "GET"},
    ]
    merged = merger.merge(records)
    # Compare against a str literal; a bytes literal never equals a str on Python 3.
    assert merged[0]['path'] == '/{id1}/login;{id2}'
    print(True)
def test_five():
    """API merge unit test: only the segment that really varies becomes a placeholder.

    The second-to-last segment takes five distinct values over six records,
    while the last segment takes only two, so the expected patterns keep the
    last segment literal.
    """
    merger = PatternMerge(2, 3, 2, 0.6, on_merge)
    records = [
        {"path": "/test/a1/0/1", "host": "10.10.65.21", "port": 80,
         "match": 99.2911744266852, "method": "GET"},
        {"path": "/test/a1/1/1", "host": "10.10.65.21", "port": 80,
         "match": 100, "method": "GET"},
        {"path": "/test/a1/2/1", "host": "10.10.65.21", "port": 80,
         "match": 99.2911744266852, "method": "GET"},
        {"path": "/test/a1/3/1", "host": "10.10.65.21", "port": 80,
         "match": 99.2911744266852, "method": "GET"},
        {"path": "/test/a1/4/1", "host": "10.10.65.21", "port": 80,
         "match": 99.2911744266852, "method": "GET"},
        {"path": "/test/a1/0/0", "host": "10.10.65.21", "port": 80,
         "match": 99.2911744266852, "method": "GET"},
    ]
    merged = merger.merge(records)
    # Compare against str literals; a bytes literal never equals a str on Python 3.
    assert merged[0]['path'] == '/test/a1/{id1}/1'
    assert merged[1]['path'] == '/test/a1/{id1}/0'
    print(True)
def test_six():
    """API merge unit test: a path with a trailing non-digit segment ("/$") still merges.

    Note that the fourth constructor argument is 2 here, which is outside the
    [0, 1] range given in the PatternMerge parameter description.
    """
    merger = PatternMerge(2, 1, 2, 2, on_merge)
    records = [
        {'method': '"GET"', 'host': '10.10.69.132', 'path': '/a1/1/$', 'type': 2,
         'port': '80', 'match': '-52.3'},
        {'method': '"GET"', 'host': '10.10.69.132', 'path': '/a1/2/$', 'type': 2,
         'port': '80', 'match': '-52.3'},
        {'method': '"GET"', 'host': '10.10.69.132', 'path': '/a1/3/$', 'type': 2,
         'port': '80', 'match': '-52.3'},
    ]
    merged = merger.merge(records)
    # Compare against a str literal; a bytes literal never equals a str on Python 3.
    assert merged[0]['path'] == '/a1/{id1}/$'
    print(True)
def on_merge(pattern, api_list):
    """Merge callback: store the largest 'match' among api_list on the pattern."""
    best_match = max(entry['match'] for entry in api_list)
    pattern['match'] = best_match
if __name__ == '__main__':
    # All test calls are currently disabled; uncomment the ones to run.
    # test_one()
    # test_two()
    # test_three()
    # test_four()
    # test_five()
    # test_six()
    pass  # a block made only of comments is a syntax error, so keep a no-op body