Introduction to PyTorch-Kaldi
PyTorch-Kaldi is a recently released speech recognition toolkit. As the name suggests, it is a hybrid of PyTorch and Kaldi. Because the DNN part of Kaldi is hard to extend (adding a new network Component requires writing its propagate and backpropagate functions by hand), the authors built the PyTorch-Kaldi toolkit, whose framework is shown in the figure below.
The toolkit still uses a DNN-HMM hybrid model for acoustic modeling, but the DNN part is implemented in PyTorch, while feature extraction, label/alignment computation, and decoding are still performed with Kaldi. This greatly simplifies building the DNN part of the acoustic model.
The project's GitHub repository: project link
The paper on arXiv: paper link
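To make the division of labor concrete: the PyTorch side is essentially a network that maps each (context-windowed) feature frame produced by Kaldi to posterior probabilities over the HMM states, and Kaldi's decoder then turns those posteriors into word sequences. The following is only a minimal sketch of such an acoustic model, not the actual PyTorch-Kaldi model code; the feature dimension and number of states are placeholder values.

import torch
import torch.nn as nn

class ToyAcousticModel(nn.Module):
    """Toy MLP acoustic model: feature frames -> log-posteriors over HMM states."""
    def __init__(self, feat_dim=440, num_states=1944, hidden=1024):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(feat_dim, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, num_states),
        )

    def forward(self, x):
        # x: (batch, feat_dim) context-windowed feature frames from Kaldi
        return torch.log_softmax(self.net(x), dim=1)

model = ToyAcousticModel()
frames = torch.randn(8, 440)   # 8 fake feature frames
log_post = model(frames)       # (8, 1944) log-posteriors; a hybrid system normalizes these by the state priors and hands them to the Kaldi decoder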
PyTorch-Kaldi Core Logic
The core logic of PyTorch-Kaldi is shown in the figure below. Each dashed box in the figure represents a Python file, and a dashed arrow indicates that the step calls another Python file.
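In practice an experiment is launched by passing a single INI-style configuration file to run_exp.py, e.g. python run_exp.py cfg/my_experiment.cfg (the path here is just a placeholder). configparser reads this file, and the sections referenced throughout the code below ([cfg_proto], [exp], [data_use], [batches], [forward] and the per-architecture sections) all come from it. A heavily abbreviated, hypothetical example with made-up values, showing only fields that run_exp.py reads:

[cfg_proto]
cfg_proto = proto/global.proto
cfg_proto_chunk = proto/global_chunk.proto

[exp]
cmd =
run_nn_script = run_nn.py
out_folder = exp/my_experiment
N_epochs_tr = 24
production = False

[data_use]
train_with = TIMIT_tr
valid_with = TIMIT_dev
forward_with = TIMIT_test

[batches]
max_seq_length_train = 1000

[architecture1]
arch_lr = 0.08
arch_improvement_threshold = 0.001
arch_halving_factor = 0.5
arch_pretrain_file = none

[forward]
save_out_file = False

The real configuration files shipped with the repository contain many more fields (feature and label definitions, the model description, decoding options); the field names above, however, are the ones read by the code that follows.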
Annotated Core Code
To give a fuller picture of PyTorch-Kaldi's code logic and make it easier to modify the framework, the most important parts of the PyTorch-Kaldi code are annotated below. The annotations for the code below can also be downloaded directly via the Baidu Cloud link.
run_exp.py
# Reading global cfg file (first argument-mandatory file)
cfg_file=sys.argv[1]
if not(os.path.exists(cfg_file)):
sys.stderr.write('ERROR: The config file %s does not exist!\n'%(cfg_file))
sys.exit(0)
else:
config = configparser.ConfigParser()
config.read(cfg_file)
# Reading and parsing optional arguments from command line (e.g.,--optimization,lr=0.002)
[section_args,field_args,value_args]=read_args_command_line(sys.argv,config)
# Output folder creation
out_folder=config['exp']['out_folder']
if not os.path.exists(out_folder):
os.makedirs(out_folder+'/exp_files')
# Log file path
log_file=config['exp']['out_folder']+'/log.log'
# Read, parse, and check the config file
cfg_file_proto=config['cfg_proto']['cfg_proto']
[config,name_data,name_arch]=check_cfg(cfg_file,config,cfg_file_proto)
# Read cfg file options
is_production=strtobool(config['exp']['production']) # "production" mode: do not train; only run forward propagation and decoding with a previously trained model
cfg_file_proto_chunk=config['cfg_proto']['cfg_proto_chunk']
cmd=config['exp']['cmd']
N_ep=int(config['exp']['N_epochs_tr'])
N_ep_str_format='0'+str(max(math.ceil(np.log10(N_ep)),1))+'d'
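# e.g. N_ep=24 gives N_ep_str_format='02d', so epoch indices are zero-padded as '00','01',...,'23'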
tr_data_lst=config['data_use']['train_with'].split(',')
valid_data_lst=config['data_use']['valid_with'].split(',')
forward_data_lst=config['data_use']['forward_with'].split(',')
max_seq_length_train=config['batches']['max_seq_length_train']
forward_save_files=list(map(strtobool,config['forward']['save_out_file'].split(',')))
print("- Reading config file......OK!")
# Copy the global cfg file into the output folder
cfg_file=out_folder+'/conf.cfg'
with open(cfg_file, 'w') as configfile:
config.write(configfile)
# Load the run_nn function from the core library
# run_nn is the function that processes a single chunk of data
run_nn_script=config['exp']['run_nn_script'].split('.py')[0]
module = importlib.import_module('core')
run_nn=getattr(module, run_nn_script)
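# e.g. with run_nn_script='run_nn.py' this resolves to the function run_nn defined in core.py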
# Splitting data into chunks (see out_folder/additional_files)
create_lists(config)
# Writing the config files
create_configs(config)
print("- Chunk creation......OK!\n")
# create res_file
res_file_path=out_folder+'/res.res' # res.res summarizes the training and validation performance of each epoch
res_file = open(res_file_path, "w")
res_file.close()
# Learning rates and architecture-specific optimization parameters
arch_lst=get_all_archs(config) # get all the architecture sections defined in the config file
lr={}
auto_lr_annealing={}
improvement_threshold={}
halving_factor={}
pt_files={}
for arch in arch_lst:
lr[arch]=expand_str_ep(config[arch]['arch_lr'],'float',N_ep,'|','*') # per-epoch learning rates
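# expand_str_ep expands the lr schedule string into one value per epoch; assumed syntax: '|' separates segments and '*' gives a repeat count, e.g. '0.08*10|0.04*14'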
if len(config[arch]['arch_lr'].split('|'))>1:
auto_lr_annealing[arch]=False
else:
auto_lr_annealing[arch]=True
improvement_threshold[arch]=float(config[arch]['arch_improvement_threshold'])
halving_factor[arch]=float(config[arch]['arch_halving_factor']) # lr halving factor
pt_files[arch]=config[arch]['arch_pretrain_file'] # pre-trained model file
# If production, skip training and forward directly from last saved models
if is_production:
ep = N_ep-1 # skip the training loop
N_ep = 0
model_files = {}
for arch in pt_files.keys():
model_files[arch] = out_folder+'/exp_files/final_'+arch+'.pkl' # the final_*.pkl models are the ones used for decoding
op_counter=1 # used to detect the next configuration file from list_chunks.txt
# Reading the ordered list of config file to process
cfg_file_list = [line.rstrip('\n') for line in open(out_folder+'/exp_files/list_chunks.txt')]
cfg_file_list.append(cfg_file_list[-1])
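# the last cfg is duplicated so that the final chunk still has a next_config_file to hand to run_nn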
# A variable that tells if the current chunk is the first one that is being processed:
processed_first=True
data_name=[]
data_set=[]
data_end_index=[]
fea_dict=[]
lab_dict=[]
arch_dict=[]
# --------TRAINING LOOP--------#
for ep in range(N_ep):
tr_loss_tot=0
tr_error_tot=0
tr_time_tot=0
print('------------------------------ Epoch %s / %s ------------------------------'%(format(ep, N_ep_str_format),format(N_ep-1, N_ep_str_format)))
for tr_data in tr_data_lst:
# Compute the total number of chunks for each training epoch
N_ck_tr=compute_n_chunks(out_folder,tr_data,ep,N_ep_str_format,'train')
N_ck_str_format='0'+str(max(math.ceil(np.log10(N_ck_tr)),1))+'d'
# ***Epoch training***
for ck in range(N_ck_tr): # train the model on each chunk
# paths of the output files (info,model,chunk_specific cfg file)
info_file=out_folder+'/exp_files/train_'+tr_data+'_ep'+format(ep, N_ep_str_format)+'_ck'+format(ck, N_ck_str_format)+'.info' # the train_*.info file reports the loss and error of each training chunk
if ep+ck==0:
model_files_past={}
else:
model_files_past=model_files
model_files={}
for arch in pt_files.keys():
model_files[arch]=info_file.replace('.info','_'+arch+'.pkl')
config_chunk_file=out_folder+'/exp_files/train_'+tr_data+'_ep'+format(ep, N_ep_str_format)+'_ck'+format(ck, N_ck_str_format)+'.cfg'
# update learning rate in the cfg file (if needed)
change_lr_cfg(config_chunk_file,lr,ep)
# if this chunk has not already been processed, do training...
if not(os.path.exists(info_file)):
print('Training %s chunk = %i / %i' %(tr_data,ck+1, N_ck_tr))
# getting the next chunk
next_config_file=cfg_file_list[op_counter]
# run chunk processing (train on this chunk)
[data_name,data_set,data_end_index,fea_dict,lab_dict,arch_dict]=run_nn(data_name,data_set,data_end_index,fea_dict,lab_dict,arch_dict,config_chunk_file,processed_first,next_config_file)
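# run_nn processes the chunk and returns the shared data lists and the feature/label/architecture dictionaries, which are passed back in when the next chunk is processed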
# update the first_processed variable
processed_first=False
if not(os.path.exists(info_file)):
sys.stderr.write("ERROR: training epoch %i, chunk %i not done! File %s does not exist.\nSee %s \n" % (ep,ck,info_file,log_file))
sys.exit(0)
# update the operation counter
op_counter+=1
# update pt_file (used to initialized the DNN for the next chunk)
for pt_arch in pt_files.keys():
pt_files[pt_arch]=out_folder+'/exp_files/train_'+tr_data+'_ep'+format(ep, N_ep_str_format)+'_ck'+format(ck, N_ck_str_format)+'_'+pt_arch+'.pkl'
# remove previous pkl files
if len(model_files_past.keys())>0:
for pt_arch in pt_files.keys():
if os.path.exists(model_files_past[pt_arch]):
os.remove(model_files_past[pt_arch])
# Training Loss and Error
tr_info_lst=sorted(glob.glob(out_folder+'/exp_files/train_'+tr_data+'_ep'+format(ep, N_ep_str_format)+'*.info'))
[tr_loss,tr_error,tr_time]=compute_avg_performance(tr_info_lst)
tr_loss_tot=tr_loss_tot+tr_loss
tr_error_tot=tr_error_tot+tr_error
tr_time_tot=tr_time_tot+tr_time
# ***Epoch validation***
if ep>0:
# store previous-epoch results (useful for learning rate annealing)
valid_peformance_dict_prev=valid_peformance_dict
valid_peformance_dict={}
tot_time=tr_time
for valid_data in valid_data_lst: # loop over the validation datasets
# Compute the number of chunks for each validation dataset
N_ck_valid=compute_n_chunks(out_folder,valid_data,ep,N_ep_str_format,'valid')
N_ck_str_format='0'+str(max(math.ceil(np.log10(N_ck_valid)),1))+'d'
for ck in range(N_ck_valid):
# paths of the output files
info_file=out_folder+'/exp_files/valid_'+valid_data+'_ep'+format(ep, N_ep_str_format)+'_ck'+format(ck, N_ck_str_format)+'.info'
config_chunk_file=out_folder+'/exp_files/valid_'+valid_data+'_ep'+format(ep, N_ep_str_format)+'_ck'+format(ck, N_ck_str_format)+'.cfg'
# Do validation if the chunk was not already processed
if not(os.path.exists(info_file)):
print('Validating %s chunk = %i / %i' %(valid_data,ck+1,N_ck_valid))
# Doing eval
# getting the next chunk
next_config_file=cfg_file_list[op_counter]
# run chunk processing
[data_name,data_set,data_end_index,fea_dict,lab_dict,arch_dict]=run_nn(data_name,data_set,data_end_index,fea_dict,lab_dict,arch_dict,config_chunk_file,processed_first,next_config_file)
# update the first_processed variable
processed_first=False
if not(os.path.exists(info_file)):
sys.stderr.write("ERROR: validation on epoch %i, chunk %i of dataset %s not done! File %s does not exist.\nSee %s \n" % (ep,ck,valid_data,info_file,log_file))
sys.exit(0)
# update the operation counter
op_counter+=1
# Compute validation performance
valid_info_lst=sorted(glob.glob(out_folder+'/exp_files/valid_'+valid_data+'_ep'+format(ep, N_ep_str_format)+'*.info'))
[valid_loss,valid_error,valid_time]=compute_avg_performance(valid_info_lst)
valid_peformance_dict[valid_data]=[valid_loss,valid_error,valid_time]
tot_time=tot_time+valid_time
# Print results in both res_file and stdout
dump_epoch_results(res_file_path, ep, tr_data_lst, tr_loss_tot, tr_error_tot, tot_time, valid_data_lst, valid_peformance_dict, lr, N_ep)
# Check for learning rate annealing
if ep>0:
# computing average validation error (on all the dataset specified)
err_valid_mean=np.mean(np.asarray(list(valid_peformance_dict.values()))[:,1])
err_valid_mean_prev=np.mean(np.asarray(list(valid_peformance_dict_prev.values()))[:,1])
for lr_arch in lr.keys():
# If an external lr schedule is not set, use newbob learning rate annealing
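# newbob: if the relative validation-error improvement (prev-curr)/curr drops below improvement_threshold (e.g. 0.001), the next epoch's lr is multiplied by halving_factor (e.g. 0.5)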
if ep<N_ep-1 and auto_lr_annealing[lr_arch]:
if ((err_valid_mean_prev-err_valid_mean)/err_valid_mean)<improvement_threshold[lr_arch]:
lr[lr_arch][ep+1]=str(float(lr[lr_arch][ep])*halving_factor[lr_arch])
# Training has ended, copy the last .pkl to final_arch.pkl for production (the .pkl files are the trained model checkpoints)
for pt_arch in pt_files.keys():
if os.path.exists(model_files[pt_arch]) and not os.path.exists(out_folder+'/exp_files/final_'+pt_arch+'.pkl'):
copyfile(model_files[pt_arch], out_folder+'/exp_files/final_'+pt_arch+'.pkl')
# --------FORWARD--------#
for forward_data in forward_data_lst: # forward_data_lst is the forward_with field of the config file
# Compute the number of chunks
N_ck_forward=compute_n_chunks(out_folder,forward_data,ep,N_ep_str_format,'forward') # number of forward chunks (can simply be 1)
N_ck_str_format='0'+str(max(math.ceil(np.log10(N_ck_forward)),1))+'d'
for ck in range(N_ck_forward):
if not is_production:
print('Testing %s chunk = %i / %i' %(forward_data,ck+1, N_ck_forward))
else:
print('Forwarding %s chunk = %i / %i' %(forward_data,ck+1, N_ck_forward))
# output file
info_file=out_folder+'/exp_files/forward_'+forward_data+'_ep'+format(ep, N_ep_str_format)+'_ck'+format(ck, N_ck_str_format)+'.info' # the .info file stores the computation time
config_chunk_file=out_folder+'/exp_files/forward_'+forward_data+'_ep'+format(ep, N_ep_str_format)+'_ck'+format(ck, N_ck_str_format)+'.cfg' # the .cfg file holds the configuration of this step (these chunk cfg files were created earlier)
# Do forward if the chunk was not already processed
if not(os.path.exists(info_file)):