Introduction to PyTorch-Kaldi
PyTorch-Kaldi is a recently released speech recognition toolkit. As the name suggests, it is a hybrid of PyTorch and Kaldi. Because the DNN part of Kaldi is hard to extend (adding a new network Component requires writing its propagate and backpropagate functions by hand), the authors built the PyTorch-Kaldi toolkit, whose framework is shown in the figure below.
The toolkit still uses a DNN-HMM hybrid model for acoustic modeling, but the DNN part is implemented in PyTorch, while feature extraction, label/alignment computation, and decoding are still performed with Kaldi. This greatly simplifies building the DNN part of the acoustic model.
The project's GitHub repository: project link
The paper on arXiv: paper link
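To make the division of labor concrete: the PyTorch side is essentially a network that maps each (context-windowed) feature frame produced by Kaldi to posterior probabilities over the HMM states, and Kaldi's decoder then turns those posteriors into word sequences. The following is only a minimal sketch of such an acoustic model, not the actual PyTorch-Kaldi model code; the feature dimension and number of states are placeholder values.

import torch
import torch.nn as nn

class ToyAcousticModel(nn.Module):
    """Toy MLP acoustic model: feature frames -> log-posteriors over HMM states."""
    def __init__(self, feat_dim=440, num_states=1944, hidden=1024):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(feat_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, num_states),
        )

    def forward(self, x):
        # x: (batch, feat_dim) context-windowed feature frames from Kaldi
        return torch.log_softmax(self.net(x), dim=1)

model = ToyAcousticModel()
frames = torch.randn(8, 440)   # 8 fake feature frames
log_post = model(frames)       # (8, 1944) log-posteriors; a hybrid system normalizes these by the state priors and hands them to the Kaldi decoder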
PyTorch-Kaldi Core Logic
The core logic of PyTorch-Kaldi is shown in the figure below. Each dashed box in the figure represents a Python file, and a dashed arrow indicates that the step calls another Python file.
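In practice an experiment is launched by passing a single INI-style configuration file to run_exp.py, e.g. python run_exp.py cfg/my_experiment.cfg (the path here is just a placeholder). configparser reads this file, and the sections referenced throughout the code below ([cfg_proto], [exp], [data_use], [batches], [forward] and the per-architecture sections) all come from it. A heavily abbreviated, hypothetical example with made-up values, showing only fields that run_exp.py reads:

[cfg_proto]
cfg_proto = proto/global.proto
cfg_proto_chunk = proto/global_chunk.proto

[exp]
cmd =
run_nn_script = run_nn.py
out_folder = exp/my_experiment
N_epochs_tr = 24
production = False

[data_use]
train_with = TIMIT_tr
valid_with = TIMIT_dev
forward_with = TIMIT_test

[batches]
max_seq_length_train = 1000

[architecture1]
arch_lr = 0.08
arch_improvement_threshold = 0.001
arch_halving_factor = 0.5
arch_pretrain_file = none

[forward]
save_out_file = False

The real configuration files shipped with the repository contain many more fields (feature and label definitions, the model description, decoding options); the field names above, however, are the ones read by the code that follows.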
Annotated Core Code
To give a fuller picture of PyTorch-Kaldi's code logic and make it easier to modify the framework, the most important parts of the PyTorch-Kaldi code are annotated below. The annotations for the code below can also be downloaded directly via the Baidu Cloud link.
run_exp.py
# Reading global cfg file (first argument-mandatory file)
cfg_file=sys.argv[1]
if not(os.path.exists(cfg_file)):
sys.stderr.write('ERROR: The config file %s does not exist!\n'%(cfg_file))
sys.exit(0)
else:
config = configparser.ConfigParser()
config.read(cfg_file)
# Reading and parsing optional arguments from command line (e.g.,--optimization,lr=0.002)
[section_args,field_args,value_args]=read_args_command_line(sys.argv,config)
# Output folder creation
out_folder=config['exp']['out_folder']
if not os.path.exists(out_folder):
os.makedirs(out_folder+'/exp_files')
# Log file path
log_file=config['exp']['out_folder']+'/log.log'
# Read, parse, and check the config file
cfg_file_proto=config['cfg_proto']['cfg_proto']
[config,name_data,name_arch]=check_cfg(cfg_file,config,cfg_file_proto)
# Read cfg file options
is_production=strtobool(config['exp']['production']) # "production" mode: do not train; only run forward propagation and decoding with a previously trained model
cfg_file_proto_chunk=config['cfg_proto']['cfg_proto_chunk']
cmd=config['exp']['cmd']
N_ep=int(config['exp']['N_epochs_tr'])
N_ep_str_format='0'+str(max(math.ceil(np.log10(N_ep)),1))+'d'
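# e.g. N_ep=24 gives N_ep_str_format='02d', so epoch indices are zero-padded as '00','01',...,'23'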
tr_data_lst=config['data_use']['train_with'].split(',')
valid_data_lst=config['data_use']['valid_with'].split(',')
forward_data_lst=config['data_use']['forward_with'].split(',')
max_seq_length_train=config['batches']['max_seq_length_train']
forward_save_files=list(map(strtobool,config['forward']['save_out_file'].split(',')))
print("- Reading config file......OK!")
# Copy the global cfg file into the output folder
cfg_file=out_folder+'/conf.cfg'
with open(cfg_file, 'w') as configfile:
config.write(configfile)
# Load the run_nn function from the core library
# run_nn is the function that processes a single chunk of data
run_nn_script=config['exp']['run_nn_script'].split('.py')[0]
module = importlib.import_module('core')
run_nn=getattr(module, run_nn_script)
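# e.g. with run_nn_script='run_nn.py' this resolves to the function run_nn defined in core.py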
# Splitting data into chunks (see out_folder/additional_files)
create_lists(config)
# Writing the config files
create_configs(config)
print("- Chunk creation......OK!\n")
# create res_file
res_file_path=out_folder+'/res.res' # res.res summarizes the training and validation performance of each epoch
res_file = open(res_file_path, "w")
res_file.close()
# Learning rates and architecture-specific optimization parameters
arch_lst=get_all_archs(config) # get all the architecture sections defined in the config file
lr={}
auto_lr_annealing={}
improvement_threshold={}
halving_factor={}
pt_files={}
for arch in arch_lst:
lr[arch]=expand_str_ep(config[arch]['arch_lr'],'float',N_ep,'|','*') # per-epoch learning rates
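# expand_str_ep expands the lr schedule string into one value per epoch; assumed syntax: '|' separates segments and '*' gives a repeat count, e.g. '0.08*10|0.04*14'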
if len(config[arch]['arch_lr'].split('|'))>1:
auto_lr_annealing[arch]=False
else:
auto_lr_annealing[arch]=True
improvement_threshold[arch]=float(config[arch]['arch_improvement_threshold'])
halving_factor[arch]=float(config[arch]['arch_halving_factor']) # lr halving factor
pt_files[arch]=config[arch]['arch_pretrain_file'] # pre-trained model file
# If production, skip training and forward directly from last saved models
if is_production:
ep = N_ep-1 # skip the training loop
N_ep = 0
model_files = {}
for arch in pt_files.keys():
model_files[arch] = out_folder+'/exp_files/final_'+arch+'.pkl' # the final_*.pkl models are the ones used for decoding
op_counter=1 # used to detect the next configuration file from list_chunks.txt
# Reading the ordered list of config file to process
cfg_file_list = [line.rstrip('\n') for line in open(out_folder+'/exp_files/list_chunks.txt')]
cfg_file_list.append(cfg_file_list[-1])
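# the last cfg is duplicated so that the final chunk still has a next_config_file to hand to run_nn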
# A variable that tells if the current chunk is the first one that is being processed:
processed_first=True
data_name=[]
data_set=[]
data_end_index=[]
fea_dict=[]
lab_dict=[]
arch_dict=[]
# --------TRAINING LOOP--------#
for ep in range(N_ep):
tr_loss_tot=0
tr_error_tot=0
tr_time_tot=0
print('------------------------------ Epoch %s / %s ------------------------------'%(format(ep, N_ep_str_format),format(N_ep-1, N_ep_str_format)))
for tr_data in tr_data_lst:
# Compute the total number of chunks for each training epoch
N_ck_tr=compute_n_chunks(out_folder,tr_data,ep,N_ep_str_format,'train')
N_ck_str_format='0'+str(max(math.ceil(np.log10(N_ck_tr)),1))+'d'
# ***Epoch training***
for ck in range(N_ck_tr): # train the model on each chunk
# paths of the output files (info,model,chunk_specific cfg file)
info_file=out_folder+'/exp_files/train_'+tr_data+'_ep'+format(ep, N_ep_str_format)+'_ck'+format(ck, N_ck_str_format)+'.info' # the train_*.info file reports the loss and error of each training chunk
if ep+ck==0:
model_files_past={}
else:
model_files_past=model_files
model_files={}
for arch in pt_files.keys():
model_files[arch]=info_file.replace('.info','_'+arch+'.pkl')
config_chunk_file=out_folder+'/exp_files/train_'+tr_data+'_ep'+format(ep, N_ep_str_format)+'_ck'+format(ck, N_ck_str_format)+'.cfg'
# update learning rate in the cfg file (if needed)
change_lr_cfg(config_chunk_file,lr,ep)
# if this chunk has not already been processed, do training...
if not(os.path.exists(info_file)):
print('Training %s chunk = %i / %i' %(tr_data,ck+1, N_ck_tr))
# getting the next chunk
next_config_file=cfg_file_list[op_counter]
# run chunk processing (train on this chunk)
[data_name,data_set,data_end_index,fea_dict,lab_dict,arch_dict]=run_nn(data_name,data_set,data_end_index,fea_dict,lab_dict,arch_dict,config_chunk_file,processed_first,next_config_file)
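# run_nn processes the chunk and returns the shared data lists and the feature/label/architecture dictionaries, which are passed back in when the next chunk is processed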
# update the first_processed variable
processed_first=False
if not(os.path.exists(info_file)):
sys.stderr.write("ERROR: training epoch %i, chunk %i not done! File %s does not exist.\nSee %s \n" % (ep,ck,info_file,log_file))
sys.exit(0)
# update the operation counter
op_counter+=1
# update pt_file (used to initialized the DNN for the next chunk)
for pt_arch in pt_files.keys():
pt_files[pt_arch]=out_folder+'/exp_files/train_'+tr_data+'_ep'+format(ep, N_ep_str_format)+'_ck'+format(ck, N_ck_str_format)+'_'+pt_arch+'.pkl'
# remove previous pkl files
if len(model_files_past.keys())>0:
for pt_arch in pt_files.keys():
if os.path.exists(model_files_past[pt_arch]):
os.remove(model_files_past[pt_arch])
# Training Loss and Error
tr_info_lst=sorted(glob.glob(out_folder+'/exp_files/train_'+tr_data+'_ep'+format(ep, N_ep_str_format)+'*.info'))
[tr_loss,tr_error,tr_time]=compute_avg_performance(tr_info_lst)
tr_loss_tot=tr_loss_tot+tr_loss
tr_error_tot=tr_error_tot+tr_error
tr_time_tot=tr_time_tot+tr_time
# ***Epoch validation***
if ep>0:
# store previous-epoch results (useful for learning rate annealing)
valid_peformance_dict_prev=valid_peformance_dict
valid_peformance_dict={}
tot_time=tr_time
for valid_data in valid_data_lst: # loop over the validation datasets
# Compute the number of chunks for each validation dataset
N_ck_valid=compute_n_chunks(out_folder,valid_data,ep,N_ep_str_format,'valid')
N_ck_str_format='0'+str(max(math.ceil(np.log10(N_ck_valid)),1))+'d'
for ck in range(N_ck_valid):
# paths of the output files
info_file=out_folder+'/exp_files/valid_'+valid_data+'_ep'+format(ep, N_ep_str_format)+'_ck'+format(ck, N_ck_str_format)+'.info'
config_chunk_file=out_folder+'/exp_files/valid_'+valid_data+'_ep'+format(ep, N_ep_str_format)+'_ck'+format(ck, N_ck_str_format)+'.cfg'
# Do validation if the chunk was not already processed
if not(os.path.exists(info_file)):
print('Validating %s chunk = %i / %i' %(valid_data,ck+1,N_ck_valid))
# Doing eval
# getting the next chunk
next_config_file=cfg_file_list[op_counter]
# run chunk processing
[data_name,data_set,data_end_index,fea_dict,lab_dict,arch_dict]=run_nn(data_name,data_set,data_end_index,fea_dict,lab_dict,arch_dict,config_chunk_file,processed_first,next_config_file)
# update the first_processed variable
processed_first=False
if not(os.path.exists(info_file)):
sys.stderr.write("ERROR: validation on epoch %i, chunk %i of dataset %s not done! File %s does not exist.\nSee %s \n" % (ep,ck,valid_data,info_file,log_file))
sys.exit(0)
# update the operation counter
op_counter+=1
# Compute validation performance
valid_info_lst=sorted(glob.glob(out_folder+'/exp_files/valid_'+valid_data+'_ep'+format(ep, N_ep_str_format)+'*.info'))
[valid_loss,valid_error,valid_time]=compute_avg_performance(valid_info_lst)
valid_peformance_dict[valid_data]=[valid_loss,valid_error,valid_time]
tot_time=tot_time+valid_time
# Print results in both res_file and stdout
dump_epoch_results(res_file_path, ep, tr_data_lst, tr_loss_tot, tr_error_tot, tot_time, valid_data_lst, valid_peformance_dict, lr, N_ep)
# Check for learning rate annealing
if ep>0:
# computing average validation error (on all the dataset specified)
err_valid_mean=np.mean(np.asarray(list(valid_peformance_dict.values()))[:,1])
err_valid_mean_prev=np.mean(np.asarray(list(valid_peformance_dict_prev.values()))[:,1])
for lr_arch in lr.keys():
# If an external lr schedule is not set, use newbob learning rate annealing
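# newbob: if the relative validation-error improvement (prev-curr)/curr drops below improvement_threshold (e.g. 0.001), the next epoch's lr is multiplied by halving_factor (e.g. 0.5)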
if ep<N_ep-1 and auto_lr_annealing[lr_arch]:
if ((err_valid_mean_prev-err_valid_mean)/err_valid_mean)<improvement_threshold[lr_arch]:
lr[lr_arch][ep+1]=str(float(lr[lr_arch][ep])*halving_factor[lr_arch])
# Training has ended, copy the last .pkl to final_arch.pkl for production (the .pkl files are the trained model checkpoints)
for pt_arch in pt_files.keys():
if os.path.exists(model_files[pt_arch]) and not os.path.exists(out_folder+'/exp_files/final_'+pt_arch+'.pkl'):
copyfile(model_files[pt_arch], out_folder+'/exp_files/final_'+pt_arch+'.pkl')
# --------FORWARD--------#
for forward_data in forward_data_lst: # forward_data_lst is the forward_with field of the config file
# Compute the number of chunks
N_ck_forward=compute_n_chunks(out_folder,forward_data,ep,N_ep_str_format,'forward') # number of forward chunks (can simply be 1)
N_ck_str_format='0'+str(max(math.ceil(np.log10(N_ck_forward)),1))+'d'
for ck in range(N_ck_forward):
if not is_production:
print('Testing %s chunk = %i / %i' %(forward_data,ck+1, N_ck_forward))
else:
print('Forwarding %s chunk = %i / %i' %(forward_data,ck+1, N_ck_forward))
# output file
info_file=out_folder+'/exp_files/forward_'+forward_data+'_ep'+format(ep, N_ep_str_format)+'_ck'+format(ck, N_ck_str_format)+'.info' # the .info file stores the computation time
config_chunk_file=out_folder+'/exp_files/forward_'+forward_data+'_ep'+format(ep, N_ep_str_format)+'_ck'+format(ck, N_ck_str_format)+'.cfg' # the .cfg file holds the configuration of this step (these chunk cfg files were created earlier)
# Do forward if the chunk was not already processed
if not(os.path.exists(info_file)):