#!/usr/bin/env python
# -*- coding:utf-8 -*-
import argparse
import datetime
import random
import re
import json
import copy
import glob
import os
import platform
import uuid
from os.path import dirname, isfile, join, basename
# Detect the host operating system and pick the matching path separator.
g_platform = platform.system().lower()
g_path_split = "\\" if g_platform == "windows" else "/"
# Regular expression matching the textual description of a test step inside a
# ``self.step("...")`` call; group 2 is the description text.
g_step_pattern = r'self\.step\("(I|O|P|T)\d\.?\、?\d?\s?(.*?)"\)'
# Relative path of this service's automation scripts inside the code repository.
g_relative_path = os.path.join(
    *("GaussDBV5", "gaussdbv5_miniaturization", "testcases")
)
# Template for one output JSON record; make_json() deep-copies it and then
# overrides the top-level keys it is given.
g_json_template = {
    "product_info": {
        "product_line_name": "华为云云服务产品部",
        "pdu_name": "数据库服务产品部-GaussDB轻量化",
    },
    "file_name": "XXX_003.py",
    "language": "python",
    "test_framework": "TEP_DBSAutos",
    "dependencies": "",
    "class_name": "",
    "git_info": {
        "git_url": "https://codehub-g.huawei.com/dbs/UTS/tests.git",
        "branch": "master",
    },
    "path": "",
    "tc_info": {
        "test_case_name": "",
        "test_case_number": "",
        "test_case_type": "Function test",
        "test_activity": "",
        "test_feature": "",
        "test_environment_type": "",
        "pre_condition": "",
        "test_step": "",
        "expected_result": "",
    },
    "code_pair_list": [
        {
            "prompt": "",
            "code": "",
            "prompt_id": "",
            "prompt_pos": "",
        },
    ],
    "test_suite": "",
    "dataset_type": "",
    "case_id": "",
}
def get_json_files(path):
    """Collect every ``.json`` file found in *path* and its sub-directories.

    :param path: directory holding the JSON data.
    :return: list of file paths, each built by joining the walked directory
        with the file name (so they are only absolute if *path* is).
    """
    return [
        os.path.join(folder, entry)
        for folder, _subdirs, entries in os.walk(path)
        for entry in entries
        if entry.endswith(".json")
    ]
def read_json_content_app(json_file):
    """Extract script paths and test-case text information from one JSON file.

    The file is expected to hold a list of case objects. For every case the
    script path is taken from ``"script_path"``; when that is missing or
    empty, ``<test_case_number>.py`` is searched for below
    ``<local_code_path>/<g_relative_path>``. Cases whose script cannot be
    located are skipped entirely.

    :param json_file: path of the JSON file to read.
    :return: ``(script_paths, tc_info_by_case_number)`` where the second item
        maps ``"test_case_number"`` to the case's ``"case_info"`` value.
    """
    app_file_lst = []
    all_tc_info_dict = {}
    with open(json_file, encoding='utf-8') as f:
        data = json.load(f)
    for one_case in data:
        case_num = one_case.get("test_case_number")
        script_path = one_case.get("script_path")
        if not script_path:
            # No usable path in the input: fall back to searching the local
            # checkout. A missing key yields None, so test truthiness rather
            # than comparing with '' (which let None through as a "path").
            local_code_path = one_case.get("local_code_path")
            if local_code_path and case_num:
                search_root = os.path.join(local_code_path, g_relative_path)
                # Only search directories that exist; os.listdir would raise.
                if os.path.isdir(search_root):
                    is_got, ret_path = find_file(search_root, case_num + '.py')
                    if is_got:
                        script_path = ret_path
        if not script_path:
            # Still not found: skip this case.
            continue
        app_file_lst.append(script_path)
        all_tc_info_dict[case_num] = one_case.get("case_info")
    return app_file_lst, all_tc_info_dict
def traversal_file(path):
    """Recursively walk *path* and map every file name to its path.

    When the same file name occurs in several directories, the entry written
    last wins (a sub-directory's result overwrites names recorded earlier).

    :param path: directory to traverse.
    :return: dict of ``{file name: file path}``.
    """
    ret_dict = {}
    for name in os.listdir(path):
        # os.path.join avoids a doubled separator when *path* already ends
        # with one, unlike manual string concatenation.
        full_path = os.path.join(path, name)
        if os.path.isfile(full_path):
            ret_dict[name] = full_path
        elif os.path.isdir(full_path):
            ret_dict.update(traversal_file(full_path))
    return ret_dict
def find_file(path, file_name):
    """Search *path* recursively for a file called *file_name*.

    Entries are visited in ``os.listdir`` order and sub-directories are
    descended into as they are met; the first match ends the search.

    :param path: directory in which to search.
    :param file_name: name of the file to look for.
    :return: ``(True, full path)`` when found, otherwise ``(False, '')``.
        A *path* that is not an existing directory also yields
        ``(False, '')`` instead of raising.
    """
    if not os.path.isdir(path):
        return False, ''
    for name in os.listdir(path):
        full_path = os.path.join(path, name)
        if os.path.isfile(full_path):
            if name == file_name:
                return True, full_path
        elif os.path.isdir(full_path):
            is_got, ret_path = find_file(full_path, file_name)
            if is_got:
                return True, ret_path
    return False, ''
def read_word_wrap_index(content_list, now_index):
    """Return the index of the last line of a backslash-continued statement.

    ``content_list[now_index]`` is assumed to end with a line-continuation
    backslash. Starting from the following line, the index is advanced while
    lines themselves end with a backslash.

    :param content_list: lines of the file (trailing newlines are allowed).
    :param now_index: index of the line that starts the continuation.
    :return: index of the final line of the statement, never past the last
        line of *content_list*.
    """
    if not content_list:
        return now_index
    last = len(content_list) - 1
    index = now_index + 1
    # rstrip() (not lstrip()) so the trailing "\n" kept by file iteration
    # does not hide the backslash.
    while index < last and content_list[index].rstrip().endswith('\\'):
        index += 1
    return min(index, last)
def _import_statement_end(content_list, index):
    """Return the index of the last physical line of the import at *index*.

    Follows backslash continuations and unclosed parentheses, ignoring
    anything after a ``#`` on each line. Never runs past the last line.
    """
    last = len(content_list) - 1
    depth = 0
    while True:
        code = content_list[index].split('#', 1)[0].rstrip()
        depth += code.count('(') - code.count(')')
        continued = code.endswith('\\') or depth > 0
        if not continued or index >= last:
            return index
        index += 1


def get_dependencies(content_list, start_index):
    """Extract the import section that precedes the first class definition.

    Scans from *start_index* until a line whose left-stripped text starts
    with ``class``. Import statements are lines starting (at column 0) with
    ``from`` or ``import``, including their continuation lines.

    :param content_list: lines of the script, newlines included.
    :param start_index: index at which scanning starts.
    :return: ``(dependencies, stop_index)``. *dependencies* is the text from
        the first import line through the last line of the last import
        (anything in between is kept); *stop_index* is the index of that last
        line. When no import is found the result is ``('', start_index)``.
    """
    first_import = None
    stop_index = start_index
    index = start_index
    total = len(content_list)
    while index < total:
        line = content_list[index]
        if line.startswith(('from', 'import')):
            if first_import is None:
                first_import = index
            stop_index = _import_statement_end(content_list, index)
            # Continuation lines belong to this import; do not rescan them.
            index = stop_index
        elif line.lstrip().startswith('class'):
            break
        index += 1
    if first_import is None:
        return '', start_index
    return ''.join(content_list[first_import:stop_index + 1]), stop_index
def get_class_name(content_list, start_index):
    """Find the first class deriving from ``BaseTestCase``.

    Lines are examined from *start_index* onwards. Class lines that do not
    match ``class <Name>(BaseTestCase)`` are skipped and the scan continues.

    :param content_list: lines of the script.
    :param start_index: index at which scanning starts.
    :return: ``(class_name, index of its line)``, or ``('', start_index)``
        when no such class is found.
    """
    for index in range(start_index, len(content_list)):
        stripped = content_list[index].lstrip()
        if not stripped.startswith('class'):
            continue
        # Match the left-stripped text so the regex and the prefix test above
        # agree on indented class statements.
        found = re.match(
            r'class ([a-zA-Z_\d\-]+)\W?\(BaseTestCase\)', stripped)
        if found:
            return found.group(1), index
    return '', start_index
def get_tc_info(content_list, start_index):
    """Parse the comment header of a script into test-case text information.

    The header is the span from the first to the third line containing
    ``# -`` followed by dashes, searched from *start_index*. Its stripped
    lines are concatenated and the individual fields are cut out with
    regular expressions.

    :param content_list: lines of the script.
    :param start_index: index at which the search for the header starts.
    :return: ``(tc_info, stop_index)``; *stop_index* is one past the third
        separator line, or 0 when fewer than three separators were found
        (in which case every extracted field is empty).
    """
    stripped_lines = [raw.strip() for raw in content_list]

    def capture(pattern, text):
        # First capture group of the first match, or '' when nothing matches.
        hit = re.search(pattern, text)
        return hit.group(1) if hit else ''

    def as_block(pattern, text):
        # Multi-line field: every "# " comment prefix becomes a line break.
        return capture(pattern, text).replace('# ', '\n').strip()

    def scrub(value):
        # Remove comment markers, both colon variants and ellipses.
        cleaned = value.strip()
        for junk in ("#", ":", ":", "..."):
            cleaned = cleaned.replace(junk, "")
        return cleaned

    header_start = start_index
    stop_index = 0
    separators_seen = 0
    for index in range(start_index, len(stripped_lines)):
        if not re.search("# -+", stripped_lines[index]):
            continue
        separators_seen += 1
        if separators_seen == 1:
            header_start = index
        elif separators_seen == 3:
            stop_index = index + 1
            break

    case_info = ''.join(stripped_lines[header_start:stop_index])

    test_case_number = capture('# 用例: (.*)# 级别: ', case_info)
    test_case_name = capture('# description: (.*)# -+#', case_info)
    pre_condition = as_block('# 预置条件:(.*)# 测试步骤:', case_info)
    test_step = as_block('# 测试步骤:(.*)# 预期结果:', case_info)
    expected_result = as_block('# 预期结果:(.*)# -+', case_info)

    tc_info = {
        "test_activity": "",
        "test_case_type": "",
        "test_feature": "",
        "test_environment_type": "",
        "test_case_name": scrub(test_case_name),
        "test_case_number": scrub(test_case_number),
        "pre_condition": pre_condition,
        "test_step": test_step,
        "expected_result": expected_result,
    }
    return tc_info, stop_index
def get_one_pair(pair_content_list, start_index):
    """Build one prompt/code pair starting at a ``self.step(...)`` line.

    The prompt is group 2 of ``g_step_pattern`` matched against the stripped
    step line ('' when it does not match). The code is every following line,
    stripped and newline-terminated, up to (not including) the next line that
    starts with ``self.step(`` or contains ``def steps(self):`` /
    ``def teardown(self):``.

    :param pair_content_list: lines being scanned.
    :param start_index: index of the ``self.step(...)`` line.
    :return: ``({"prompt": ..., "code": ...}, end_index)`` where *end_index*
        is the index of the terminating line, or ``len(pair_content_list)``
        when the input ends first. It is therefore always one past the last
        code line and always greater than *start_index*, so a caller that
        resumes at *end_index* makes progress.
    """
    match_step = re.search(
        g_step_pattern, pair_content_list[start_index].strip())
    prompt = match_step.group(2) if match_step else ''

    code_lines = []
    end_index = len(pair_content_list)
    for index in range(start_index + 1, len(pair_content_list)):
        cur_line = pair_content_list[index]
        if (cur_line.strip().startswith("self.step(")
                or "def steps(self):" in cur_line
                or "def teardown(self):" in cur_line):
            end_index = index
            break
        code_lines.append(cur_line.strip() + '\n')
    return {"prompt": prompt, "code": ''.join(code_lines)}, end_index
def get_code_pair_list(content_list, start_index, verify, case_id):
    """Collect all prompt/code pairs of a script.

    Scanning starts at the first ``def setup(self):`` line at or after
    *start_index* (or at *start_index* itself when there is none). Every
    ``self.step(...)`` line matching ``g_step_pattern`` opens a pair; pairs
    whose code is blank are dropped.

    :param content_list: lines of the script.
    :param start_index: index from which ``def setup(self):`` is searched.
    :param verify: when true, each pair also gets ``"above_text"``: the file
        content preceding the step line.
    :param case_id: identifier used as prefix of each ``"prompt_id"``.
    :return: list of pair dicts with ``prompt``, ``code``, ``prompt_pos``,
        ``code_pos``, ``prompt_id`` (and ``above_text`` when *verify*).
        Positions are 1-based line numbers formatted as ``"[first,last]"``.
    """
    pair_start_index = start_index
    for i in range(start_index, len(content_list)):
        if content_list[i].lstrip().startswith('def setup(self):'):
            pair_start_index = i
            break

    pair_content_list = content_list[pair_start_index:]
    code_pair_list = []
    j = 0
    while j < len(pair_content_list):
        stripped = pair_content_list[j].strip()
        if not (stripped.startswith("self.step(")
                and re.search(g_step_pattern, stripped)):
            j += 1
            continue
        pair_info, index = get_one_pair(pair_content_list, j)
        if pair_info["code"].strip():
            prompt_line = pair_start_index + 1 + j
            pair_info["prompt_pos"] = "[{},{}]".format(prompt_line, prompt_line)
            pair_info["code_pos"] = "[{},{}]".format(
                pair_start_index + 2 + j, pair_start_index + index)
            pair_info["prompt_id"] = "{}-{:0>10d}".format(
                case_id, random.randint(1000000000, 2147483647))
            if verify:
                pair_info["above_text"] = ''.join(
                    content_list[:pair_start_index + j])
            code_pair_list.append(pair_info)
        # Resume at the line that ended the pair (it may itself be a step),
        # but always move forward: if a step is the very last line, the
        # returned index can equal j and the loop would never terminate.
        j = max(index, j + 1)
    return code_pair_list
def get_result(path, tc_info_dict, verify=False):
    """Produce the cleaned record for one script file.

    :param path: path of the script file.
    :param tc_info_dict: test-case text information from the input files,
        keyed by case number (which is expected to equal the class name).
    :param verify: True for the validation set, False for the training set.
    :return: dict with ``file_name``, ``dependencies``, ``class_name``,
        ``path``, ``tc_info``, ``code_pair_list``, ``test_suite``,
        ``dataset_type`` and ``case_id``.
    """
    file_name = basename(path)
    with open(path, 'r', encoding='utf-8') as f:
        content_list = f.readlines()

    dependencies, index = get_dependencies(
        content_list=content_list, start_index=0)
    class_name, index = get_class_name(
        content_list=content_list, start_index=index)

    # The class name doubles as the case number used to look up the text
    # description. Use .get() so an unknown class falls back to the script
    # header below instead of raising KeyError.
    tc_info = tc_info_dict.get(class_name)
    # No usable tc_info from the input (judged by test_case_name being
    # empty): try to extract it from the script file itself.
    if not isinstance(tc_info, dict) or not tc_info.get("test_case_name"):
        tc_info, _ = get_tc_info(content_list=content_list, start_index=0)

    # NOTE: case_id is not the case number; it is an id for the model.
    case_id = str(uuid.uuid1())
    code_pair_list = get_code_pair_list(
        content_list=content_list, start_index=index,
        verify=verify, case_id=case_id)
    dataset_type = "test" if verify else "train"
    return {
        'file_name': file_name,
        'dependencies': dependencies,
        'class_name': class_name,
        'path': path,
        'tc_info': tc_info,
        'code_pair_list': code_pair_list,
        'test_suite': "",
        "dataset_type": dataset_type,
        "case_id": case_id
    }
def save_result(output, results=None, batch_size=1000):
    """Write the collected records to JSON files, one file per batch.

    Records with an empty ``code_pair_list`` are left out. Each file is named
    ``Corpus_GaussDBForMySQL_batch_train_<pair count>_<today>.json``; when a
    second batch of the same call would get an identical name, the batch
    number is appended so earlier output is not overwritten. A failure in
    one batch is reported and the remaining batches are still written.

    :param output: directory in which the files are created.
    :param results: records to save; defaults to the module-level
        ``result_list`` filled in by the script entry point.
    :param batch_size: maximum number of records per file.
    """
    if results is None:
        results = result_list
    used_names = set()
    # Slice inline; the step/stop arithmetic is all that is needed here.
    for batch_no, start in enumerate(range(0, len(results), batch_size)):
        try:
            kept = [
                case for case in results[start:start + batch_size]
                if case.get("code_pair_list")
            ]
            prompt_num = sum(len(case["code_pair_list"]) for case in kept)
            stem = "%s_%s_%s" % (
                "Corpus_GaussDBForMySQL",
                "batch_train_%d" % prompt_num,
                datetime.date.today())
            if stem in used_names:
                # Same pair count as an earlier batch: keep both files.
                stem = "%s_%d" % (stem, batch_no)
            used_names.add(stem)
            pair_file_path = os.path.join(output, stem + ".json")
            print("pair_file_path:", pair_file_path)
            with open(pair_file_path, 'w', encoding='utf-8') as f:
                json.dump(kept, f, ensure_ascii=False)
        except Exception as e:
            # Best effort: report and continue with the next batch.
            print(e)
            print('batch %d write failed' % batch_no)
def make_json(**kwargs):
    """Return a deep copy of ``g_json_template`` with *kwargs* applied.

    Each keyword replaces (or adds) the top-level key of the same name; the
    template itself is never modified.
    """
    merged = copy.deepcopy(g_json_template)
    merged.update(kwargs)
    return merged
def batch(iterable, num):
    """Split a sliceable sequence into consecutive chunks of at most *num* items.

    :param iterable: sequence supporting ``len()`` and slicing.
    :param num: chunk size.
    :return: list of slices; the last one may be shorter.
    """
    total = len(iterable)
    return [iterable[begin:begin + num] for begin in range(0, total, num)]
if __name__ == '__main__':
    # Command-line arguments.
    parser = argparse.ArgumentParser(description="generate json data of train")
    parser.add_argument('--input', '-i', type=str, required=True,
                        help='input data path')
    parser.add_argument('--output', '-o', type=str, required=True,
                        help='output data path')
    parser.add_argument('--verify', '-v', action='store_true', default=False,
                        help='it is a validation set.')
    args = parser.parse_args()
    if not (args.input and args.output):
        raise Exception("输入异常,参数 -i 或 -o 不能为空")

    input_path, output_path, verify = args.input, args.output, args.verify
    if not os.path.exists(output_path):
        os.makedirs(output_path, exist_ok=True)

    # Gather the scripts and case descriptions listed in the input JSON files.
    app_file_list = []
    app_tc_info_dict = {}
    json_file_list = get_json_files(input_path)
    print("json_file_list:", json_file_list)
    for json_file in json_file_list:
        tmp_file_list, tmp_tc_info_dict = read_json_content_app(json_file)
        app_file_list.extend(tmp_file_list)
        app_tc_info_dict.update(tmp_tc_info_dict)

    # Extract the L2 corpus from every listed script; save_result() reads the
    # module-level result_list filled here.
    result_list = []
    print(len(app_file_list), len(app_tc_info_dict))
    for file_list_slice in batch(app_file_list, 1000):
        for file_path in file_list_slice:
            try:
                if not file_path.split(g_path_split)[-1].endswith(".py"):
                    continue
                result_json = make_json(
                    **get_result(file_path, app_tc_info_dict, verify))
                if result_json.get("code_pair_list"):
                    result_list.append(result_json)
            except Exception as e:
                print(e)
                print('%s read failed' % file_path)

    # Write the output files.
    save_result(output_path)
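# NOTE: the two lines below are trailing web-page text, not part of the script.
# They are kept as comments so the module still parses and runs.
# 帮我解析程序  ("Help me analyse this program")
# 最新发布  ("Latest release")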