import os
import json
import torch
import librosa
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter
from Levenshtein import distance as lev_distance
import argparse
from tqdm import tqdm
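# ---------------------------------------------------------------------------
# Overview of this script (end-to-end Chinese speech recognition on THCHS-30):
#   SpeechDataset          -> reads a JSON-lines manifest and extracts MFCCs
#   collate_fn             -> zero-pads variable-length features/labels
#   SpeechRecognitionModel -> stacked 1-D convolutions with GLU activations,
#                             trained with the CTC loss
#   GreedyDecoder          -> best-path decoding and character error rate (CER)
#   train()                -> training / validation loop with TensorBoard logs
# ---------------------------------------------------------------------------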
# Dataset: reads a JSON-lines manifest and yields (MFCC features, label ids).
class SpeechDataset(Dataset):
    def __init__(self, data_list, dict_path, mean=None, std=None, min_duration=0, max_duration=-1):
        """
        data_list: path to a manifest file, one JSON object per line with the
                   keys "audio_path", "text" and "duration" (in seconds).
        dict_path: path to a JSON file containing the character vocabulary.
        mean/std:  optional global statistics used to normalize the MFCCs.
        min_duration/max_duration: duration filter; -1 disables the upper bound.
        """
        super(SpeechDataset, self).__init__()
        self.mean = mean
        self.std = std
        # Read the manifest and keep only utterances inside the duration range.
        with open(data_list, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        self.data_list = []
        for line in lines:
            try:
                line = json.loads(line)
            except json.JSONDecodeError:
                # Skip malformed manifest lines instead of aborting.
                continue
            if line["duration"] < min_duration:
                continue
            if max_duration != -1 and line["duration"] > max_duration:
                continue
            self.data_list.append([line["audio_path"], line["text"]])
        # Map every character of the vocabulary to an integer id.
        with open(dict_path, 'r', encoding='utf-8') as f:
            labels = json.load(f)
        self.vocabulary = {labels[i]: i for i in range(len(labels))}

    def __getitem__(self, idx):
        # Load one utterance and convert its transcript to label ids.
        # Characters missing from the vocabulary fall back to id 0.
        wav_path, transcript = self.data_list[idx]
        mfccs = self.load_audio_mfcc(wav_path)
        transcript = [self.vocabulary.get(x, 0) for x in transcript]
        return torch.FloatTensor(mfccs), torch.LongTensor(transcript)

    def __len__(self):
        return len(self.data_list)

    def load_audio_mfcc(self, wav_path, n_mfcc=128):
        # Resample to 16 kHz and extract n_mfcc MFCC coefficients per frame.
        wav, sr = librosa.load(wav_path, sr=16000)
        mfccs = librosa.feature.mfcc(y=wav, sr=sr, n_mfcc=n_mfcc, n_fft=512, hop_length=128)
        # Optional global mean/variance normalization.
        if self.mean is not None and self.std is not None:
            mfccs = (mfccs - self.mean) / self.std
        return mfccs
def collate_fn(batch):
    """Zero-pad a list of (features, labels) samples into fixed-size batch tensors."""
    # Sort by feature length (descending) so the longest utterance defines the padding.
    batch = sorted(batch, key=lambda sample: sample[0].shape[1], reverse=True)
    freq_size = batch[0][0].shape[0]
    max_audio_length = batch[0][0].shape[1]
    batch_size = len(batch)
    # The longest label is not necessarily the sample with the longest audio.
    max_label_length = max(sample[1].shape[0] for sample in batch)
    inputs = torch.zeros((batch_size, freq_size, max_audio_length))
    labels = torch.zeros((batch_size, max_label_length), dtype=torch.long)
    input_lens = []
    label_lens = []
    for x in range(batch_size):
        tensor = batch[x][0]
        target = batch[x][1]
        seq_length = tensor.shape[1]
        label_length = target.shape[0]
        # Copy the real data; positions beyond the true length stay zero (padding).
        inputs[x, :, :seq_length] = tensor[:, :seq_length]
        labels[x, :label_length] = target[:label_length]
        input_lens.append(seq_length)
        label_lens.append(label_length)
    # The true lengths are required by the CTC loss.
    input_lens = torch.LongTensor(input_lens)
    label_lens = torch.LongTensor(label_lens)
    return inputs, labels, input_lens, label_lens
# Model definition
class GLU(nn.Module):
    """Gated Linear Unit: splits the input along `axis` and gates one half with the other."""
    def __init__(self, axis):
        super(GLU, self).__init__()
        self.sigmoid = nn.Sigmoid()
        self.axis = axis

    def forward(self, x):
        # a is the linear path, sigmoid(b) is the gate; the output has half the channels.
        a, b = torch.split(x, x.size(self.axis) // 2, dim=self.axis)
        act_b = self.sigmoid(b)
        return a * act_b


class ConvBlock(nn.Module):
    """1-D convolution with weight normalization, a GLU activation and dropout."""
    def __init__(self, in_channels, out_channels, kernel_size, stride, padding=0, p=0.5):
        super(ConvBlock, self).__init__()
        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, stride, padding)
        self.conv = nn.utils.weight_norm(self.conv)
        # The GLU halves the channel dimension, so the block outputs out_channels // 2.
        self.act = GLU(axis=1)
        self.dropout = nn.Dropout(p)

    def forward(self, x):
        x = self.conv(x)
        x = self.act(x)
        x = self.dropout(x)
        return x
class SpeechRecognitionModel(nn.Module):
    """Fully convolutional acoustic model (GLU convolution stack) trained with CTC."""
    def __init__(self, vocabulary, data_mean=None, data_std=None):
        super(SpeechRecognitionModel, self).__init__()
        # Store normalization statistics as buffers so they travel with checkpoints.
        if data_mean is None:
            data_mean = torch.tensor(1.0)
        if data_std is None:
            data_std = torch.tensor(1.0)
        self.register_buffer("data_mean", data_mean)
        self.register_buffer("data_std", data_std)
        # One extra output unit for the CTC blank symbol.
        self.output_units = len(vocabulary) + 1
        # Each ConvBlock outputs out_channels // 2 because of the GLU.
        self.conv1 = ConvBlock(128, 500, 48, 2, padding=97, p=0.2)
        self.conv2 = ConvBlock(250, 500, 7, 1, p=0.3)
        self.conv3 = ConvBlock(250, 2000, 32, 1, p=0.3)
        self.conv4 = ConvBlock(1000, 2000, 1, 1, p=0.3)
        self.out = nn.utils.weight_norm(nn.Conv1d(1000, self.output_units, 1, 1))

    def forward(self, x, input_lens=None):
        # x: (batch, n_mfcc, time)
        x = self.conv1(x)
        # The middle block is applied 7 times with shared weights.
        for _ in range(7):
            x = self.conv2(x)
        x = self.conv3(x)
        x = self.conv4(x)
        x = self.out(x)
        if input_lens is not None:
            # The conv stack reduces the time dimension to floor(T / 2) + 1.
            return x, torch.div(input_lens, 2, rounding_mode='floor') + 1
        return x
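# Output-length bookkeeping (derived from the layer hyper-parameters above):
#   conv1 (kernel 48, stride 2, padding 97): floor((T + 194 - 48) / 2) + 1 = floor(T / 2) + 74
#   conv2 applied 7 times (kernel 7, no padding):  -42 frames
#   conv3 (kernel 32, no padding):                 -31 frames
#   conv4 / out (kernel 1):                        unchanged
#   => final time dimension = floor(T / 2) + 1, which is what forward() returns
#      and what must be passed to nn.CTCLoss as input_lengths.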
# Decoder and training routine
class GreedyDecoder:
    """Best-path (greedy) CTC decoder plus character error rate (CER) computation."""
    def __init__(self, vocabulary, blank_index):
        self.int_to_char = {i: c for c, i in vocabulary.items()}
        self.blank_index = blank_index
    def convert_to_strings(self, sequences):
        # Map label-id sequences back to character strings (used for the references).
        return [''.join(self.int_to_char.get(x, '') for x in sequence) for sequence in sequences]
    def decode(self, sequences):
        # CTC best path: collapse repeated ids, then drop the blank symbol.
        strings = []
        for sequence in sequences:
            chars, previous = [], None
            for x in sequence:
                if x != previous and x != self.blank_index:
                    chars.append(self.int_to_char.get(x, ''))
                previous = x
            strings.append(''.join(chars))
        return strings
    def cer(self, s1, s2):
        # Character error rate: edit distance normalized by the reference length.
        return lev_distance(s1, s2) / max(len(s2), 1)
def train(args):
    """Train the CTC acoustic model and evaluate the character error rate after every epoch."""
    # Create the TensorBoard log directory.
    log_dir = os.path.join(os.getcwd(), 'logs')
    os.makedirs(log_dir, exist_ok=True)
    print(f"Log directory: {os.path.abspath(log_dir)}")
    writer = SummaryWriter(log_dir=log_dir)
    # Load the training and test datasets.
    train_dataset = SpeechDataset(args.train_manifest, args.dataset_vocab, mean=args.data_mean, std=args.data_std,
                                  min_duration=args.min_duration, max_duration=args.max_duration)
    train_loader = DataLoader(train_dataset, batch_size=args.batch_size, collate_fn=collate_fn,
                              num_workers=args.num_workers, shuffle=True)
    test_dataset = SpeechDataset(args.test_manifest, args.dataset_vocab, mean=args.data_mean, std=args.data_std)
    test_loader = DataLoader(test_dataset, batch_size=args.batch_size, collate_fn=collate_fn,
                             num_workers=args.num_workers)
    # Build the model, decoder, optimizer and loss.
    vocabulary = train_dataset.vocabulary
    model = SpeechRecognitionModel(vocabulary)
    if args.pretrained_model:
        model.load_state_dict(torch.load(args.pretrained_model, map_location='cpu'))
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    # The blank symbol uses the extra output unit (index len(vocabulary)),
    # so it never collides with a real character id.
    blank_index = len(vocabulary)
    decoder = GreedyDecoder(vocabulary, blank_index)
    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)
    ctc_loss = nn.CTCLoss(blank=blank_index, zero_infinity=True)
    # Training loop
    for epoch in range(args.num_epoch):
        model.train()
        train_loader_iter = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{args.num_epoch}', unit='batch')
        for batch_idx, (inputs, labels, input_lens, label_lens) in enumerate(train_loader_iter):
            inputs = inputs.to(device)
            labels = labels.to(device)
            input_lens = input_lens.to(device)
            label_lens = label_lens.to(device)
            optimizer.zero_grad()
            # The model also returns the down-sampled sequence lengths needed by CTC.
            outputs, output_lens = model(inputs, input_lens)
            # CTC expects (time, batch, classes) with log-probabilities over the class axis.
            outputs = outputs.permute(2, 0, 1).log_softmax(2)
            loss = ctc_loss(outputs, labels, output_lens, label_lens)
            loss.backward()
            optimizer.step()
            train_loader_iter.set_postfix(loss=loss.item())
        # Validation loop
        model.eval()
        total_cer = 0
        num_samples = 0
        with torch.no_grad():
            test_loader_iter = tqdm(test_loader, desc='Validation', unit='batch')
            for inputs, labels, input_lens, label_lens in test_loader_iter:
                inputs = inputs.to(device)
                labels = labels.to(device)
                input_lens = input_lens.to(device)
                outputs, output_lens = model(inputs, input_lens)
                # Greedy decoding: argmax over the class axis, then collapse
                # repeats and remove blanks inside the decoder.
                decoded = decoder.decode(outputs.argmax(1).tolist())
                # References: strip the zero padding using the true label lengths.
                reference = decoder.convert_to_strings(
                    [labels[i, :label_lens[i]].tolist() for i in range(labels.size(0))])
                for d, r in zip(decoded, reference):
                    total_cer += decoder.cer(d, r)
                    num_samples += 1
                test_loader_iter.set_postfix(cer=total_cer / max(num_samples, 1))
        total_cer /= len(test_loader.dataset)
        print(f'\nTest Epoch: {epoch + 1} CER: {total_cer}')
        writer.add_scalar('CER/test', total_cer, epoch)
        # Save a checkpoint after every epoch.
        os.makedirs(args.save_model_path, exist_ok=True)
        torch.save(model.state_dict(), os.path.join(args.save_model_path, f'epoch_{epoch}.pt'))
if __name__ == '__main__':
    # Command-line arguments
    parser = argparse.ArgumentParser(description='Speech Recognition Training')
    parser.add_argument('--batch-size', type=int, default=32, help='batch size for training (default: 32)')
    parser.add_argument('--num-workers', type=int, default=4, help='number of workers for training (default: 4)')
    parser.add_argument('--num-epoch', type=int, default=50, help='number of epochs for training (default: 50)')
    parser.add_argument('--learning-rate', type=float, default=1e-3, help='learning rate for training (default: 1e-3)')
    parser.add_argument('--data-mean', type=float, default=-2.427870, help='mean for data normalization (default: -2.427870)')
    parser.add_argument('--data-std', type=float, default=44.181725, help='std for data normalization (default: 44.181725)')
    parser.add_argument('--min-duration', type=int, default=0, help='minimum audio duration in seconds (default: 0)')
    parser.add_argument('--max-duration', type=int, default=20, help='maximum audio duration in seconds (default: 20)')
    parser.add_argument('--train-manifest', default='dataset/manifest.train', help='path to train manifest (default: dataset/manifest.train)')
    parser.add_argument('--test-manifest', default='dataset/manifest.test', help='path to test manifest (default: dataset/manifest.test)')
    parser.add_argument('--dataset-vocab', default='dataset/zh_vocab.json', help='path to dataset vocab (default: dataset/zh_vocab.json)')
    parser.add_argument('--save-model-path', default='models/', help='path to save models (default: models/)')
    parser.add_argument('--pretrained-model', default=None, help='path to pretrained model (default: None)')
    args = parser.parse_args()
    # The manifest and vocabulary paths default to the dataset/ directory; override
    # them on the command line if the data lives elsewhere. Note that the manifests
    # must be JSON-lines files, not the raw THCHS-30 train/test folders.
    train(args)

Based on this code: design and implementation of a deep-learning speech recognition model with PyTorch.
(1) Variable, function and class names in the code must follow naming conventions, and every variable, function and class must have comments explaining what it does (the number of comment lines must be no less than 1/3 of the number of code lines). For the specific coding conventions, refer to:
[1] https://www.jianshu.com/p/8b6c425b65a6
[2] https://zhuanlan.zhihu.com/p/94634073
(2) The code must be organized properly: the data goes in the dataset directory, the code in the src directory, and the main program in main.py; a possible layout is sketched below. If the code needs additional packages, add the package names and versions to the requires file. (3) The dataset is data_thchs30.tgz. Dataset description:
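A possible project layout and requires file, shown only as an illustration (the version numbers are placeholders and should be replaced by whatever is actually installed, not treated as tested pins):

    main.py        -- entry point: argument parsing and the call to train(args)
    src/           -- SpeechDataset, collate_fn, the model, the decoder and train()
    dataset/       -- manifest.train, manifest.test, zh_vocab.json and the unpacked data_thchs30/
    models/        -- checkpoints saved after every epoch
    requires       -- for example:
                      torch==2.0.1
                      librosa==0.10.0
                      python-Levenshtein==0.21.1
                      tensorboard==2.13.0
                      tqdm==4.65.0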
THCHS-30 is an open Chinese speech corpus released by the Center for Speech and Language Technologies (CSLT) at Tsinghua University. The archive data_thchs30.tgz contains a training set and a test set with more than 13,000 utterances in total. Its top-level directories are data, train and test, which hold the audio files and the corresponding Chinese transcripts.
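The SpeechDataset above expects JSON-lines manifests and a JSON character vocabulary rather than the raw THCHS-30 folders, so a small preprocessing step is needed. Below is a minimal sketch of such a script; it assumes the usual THCHS-30 layout in which every X.wav under train/ or test/ has an X.wav.trn file whose first line is either the Chinese transcript itself or a relative path to the real transcript under data/. The directory and output names follow the defaults used in the training script and are otherwise assumptions.

    import os
    import json
    import librosa

    def build_manifest(wav_dir, manifest_path, charset):
        """Scan one THCHS-30 split and write one JSON line per utterance."""
        with open(manifest_path, 'w', encoding='utf-8') as out:
            for name in sorted(os.listdir(wav_dir)):
                if not name.endswith('.wav'):
                    continue
                wav_path = os.path.join(wav_dir, name)
                trn_path = wav_path + '.trn'
                if not os.path.exists(trn_path):
                    continue
                with open(trn_path, 'r', encoding='utf-8') as f:
                    first = f.readline().strip()
                # In train/ and test/ the .trn file may only point to the real
                # transcript stored under data/.
                if first.endswith('.trn'):
                    real_path = os.path.normpath(os.path.join(wav_dir, first))
                    with open(real_path, 'r', encoding='utf-8') as f:
                        first = f.readline().strip()
                text = first.replace(' ', '')  # first line = Chinese characters
                wav, sr = librosa.load(wav_path, sr=None)
                duration = round(len(wav) / sr, 2)
                charset.update(text)
                out.write(json.dumps({'audio_path': wav_path, 'duration': duration,
                                      'text': text}, ensure_ascii=False) + '\n')

    if __name__ == '__main__':
        charset = set()
        build_manifest('dataset/data_thchs30/train', 'dataset/manifest.train', charset)
        build_manifest('dataset/data_thchs30/test', 'dataset/manifest.test', charset)
        # The vocabulary is simply the sorted set of characters seen in the transcripts.
        with open('dataset/zh_vocab.json', 'w', encoding='utf-8') as f:
            json.dump(sorted(charset), f, ensure_ascii=False)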