After studying the FastSpeech paper, I went through a GitHub reproduction repository to better understand the details of the implementation. The chosen repository is a PyTorch implementation: https://github.com/xcmyz/FastSpeech. It uses the LJSpeech dataset; the data-processing code is covered in the note "FastSpeech reproduction GitHub project -- data preparation". This note adds detailed comments to the model-building code. The files related to model construction are those under the transformer directory, plus modules.py and model.py. I hope the annotations here help readers understand FastSpeech.
transformer
Because the FFT blocks in FastSpeech and the extraction of the alignment information used to train the duration predictor are both transformer-related, the project places the relevant files under the transformer directory.
Constants.py
This file defines constants that may be used during data processing.
PAD = 0
UNK = 1
BOS = 2
EOS = 3
PAD_WORD = '<blank>'
UNK_WORD = '<unk>'
BOS_WORD = '<s>'
EOS_WORD = '</s>'
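As a sketch of how these constants are typically used (the helper below is my own illustration, not necessarily the repo's exact code), PAD marks padding positions in a batched token-id sequence so that attention can mask them out:

import torch
import transformer.Constants as Constants

def get_attn_key_pad_mask(seq_k, seq_q):
    """Build a (batch, len_q, len_k) boolean mask that is True at padded key positions."""
    len_q = seq_q.size(1)
    padding_mask = seq_k.eq(Constants.PAD)                      # (batch, len_k)
    return padding_mask.unsqueeze(1).expand(-1, len_q, -1)      # (batch, len_q, len_k)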
Modules.py
This file implements a scaled dot-product attention module.
import torch
import torch.nn as nn
import numpy as np
# Scaled dot-product module used to compute attention
class ScaledDotProductAttention(nn.Module):
    ''' Scaled Dot-Product Attention '''

    def __init__(self, temperature, attn_dropout=0.1):
        super().__init__()
        self.temperature = temperature
        self.dropout = nn.Dropout(attn_dropout)
        self.softmax = nn.Softmax(dim=2)

    def forward(self, q, k, v, mask=None):
        # Batched matrix multiplication of q and k
        attn = torch.bmm(q, k.transpose(1, 2))
        attn = attn / self.temperature  # divide by the temperature (sqrt of d_k)
        if mask is not None:
            attn = attn.masked_fill(mask, -np.inf)
        attn = self.softmax(attn)
        attn = self.dropout(attn)
        output = torch.bmm(attn, v)
        return output, attn
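A quick shape check (the tensor sizes below are made up for illustration):

import torch
from transformer.Modules import ScaledDotProductAttention

attn_layer = ScaledDotProductAttention(temperature=8.0)   # temperature = sqrt(d_k) for d_k = 64
q = torch.randn(2, 5, 64)   # (batch, len_q, d_k)
k = torch.randn(2, 7, 64)   # (batch, len_k, d_k)
v = torch.randn(2, 7, 64)   # (batch, len_v, d_v), with len_v == len_k
output, attn = attn_layer(q, k, v)
print(output.shape, attn.shape)   # torch.Size([2, 5, 64]) torch.Size([2, 5, 7])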
SubLayers.py
This file defines the multi-head attention layer and the position-wise feed-forward network (FFN) used inside the FFT block.
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from transformer.Modules import ScaledDotProductAttention
import hparams as hp
# Multi-head attention module
class MultiHeadAttention(nn.Module):
    ''' Multi-Head Attention module '''

    def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
        super().__init__()
        self.n_head = n_head
        self.d_k = d_k
        self.d_v = d_v
        # Linear layers that project q, k, v before the attention is computed
        self.w_qs = nn.Linear(d_model, n_head * d_k)
        self.w_ks = nn.Linear(d_model, n_head * d_k)
        self.w_vs = nn.Linear(d_model, n_head * d_v)
        # Initialize the projection layers
        nn.init.normal_(self.w_qs.weight, mean=0,
                        std=np.sqrt(2.0 / (d_model + d_k)))
        nn.init.normal_(self.w_ks.weight, mean=0,
                        std=np.sqrt(2.0 / (d_model + d_k)))
        nn.init.normal_(self.w_vs.weight, mean=0,
                        std=np.sqrt(2.0 / (d_model + d_v)))

        self.attention = ScaledDotProductAttention(
            temperature=np.power(d_k, 0.5))
        self.layer_norm = nn.LayerNorm(d_model)

        self.fc = nn.Linear(n_head * d_v, d_model)
        nn.init.xavier_normal_(self.fc.weight)

        self.dropout = nn.Dropout(dropout)
    def forward(self, q, k, v, mask=None):
        d_k, d_v, n_head = self.d_k, self.d_v, self.n_head

        sz_b, len_q, _ = q.size()
        sz_b, len_k, _ = k.size()
        sz_b, len_v, _ = v.size()

        residual = q  # keep a copy for the residual connection added to the output

        # Project q, k, v and reshape to four dimensions, i.e. split into multiple heads
        q = self.w_qs(q).view(sz_b, len_q, n_head, d_k)
        k = self.w_ks(k).view(sz_b, len_k, n_head, d_k)
        v = self.w_vs(v).view(sz_b, len_v, n_head, d_v)
        # Merge the batch and head dimensions of q, k, v, reducing them back to three dimensions
        q = q.permute(2, 0, 1, 3).contiguous().view(-1,
                                                    len_q, d_k)  # (n*b) x lq x dk
        k = k.permute(2, 0, 1, 3).contiguous().view(-1,
                                                    len_k, d_k)  # (n*b) x lk x dk
        v = v.permute(2, 0, 1, 3).contiguous().view(-1,
                                                    len_v, d_v)  # (n*b) x lv x dv

        # Repeat the same mask once per head
        mask = mask.repeat(n_head, 1, 1)  # (n*b) x .. x ..
        output, attn = self.attention(q, k, v, mask=mask)  # attention output and attention matrix

        # Restore to four dimensions, then fold the heads back into the feature dimension (three dimensions again)
        output = output.view(n_head, sz_b, len_q, d_v)
        output = output.permute(1, 2, 0, 3).contiguous().view(
            sz_b, len_q, -1)  # b x lq x (n*dv)

        output = self.dropout(self.fc(output))
        output = self.layer_norm(output + residual)  # residual connection + layer norm

        return output, attn
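A small shape walkthrough for self-attention (the hyperparameter values 256/2/64 below are illustrative choices, not necessarily the repo's hparams):

import torch
from transformer.SubLayers import MultiHeadAttention

mha = MultiHeadAttention(n_head=2, d_model=256, d_k=64, d_v=64)
x = torch.randn(4, 100, 256)                       # (batch, seq_len, d_model); self-attention uses q = k = v = x
mask = torch.zeros(4, 100, 100, dtype=torch.bool)  # nothing masked in this toy example
out, attn = mha(x, x, x, mask=mask)
print(out.shape, attn.shape)                       # torch.Size([4, 100, 256]) torch.Size([8, 100, 100])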
# Position-wise feed-forward network (FFN) of the FFT block; both layers are 1-D convolutions, and a residual connection is applied
class PositionwiseFeedForward(nn.Module):
    ''' A two-feed-forward-layer module '''

    def __init__(self, d_in, d_hid, dropout=0.1):
        super().__init__()

        # Use Conv1d
        # position-wise
        self.w_1 = nn.Conv1d(
            d_in, d_hid, kernel_size=hp.fft_conv1d_kernel[0], padding=hp.fft_conv1d_padding[0])
        # position-wise
        self.w_2 = nn.Conv1d(
            d_hid, d_in, kernel_size=hp.fft_conv1d_kernel[1], padding=hp.fft_conv1d_padding[1])

        self.layer_norm = nn.LayerNorm(d_in)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        residual = x
        output = x.transpose(1, 2)  # Conv1d expects (batch, channels, length)
        output = self.w_2(F.relu(self.w_1(output)))
        output = output.transpose(1, 2)
        output = self.dropout(output)
        output = self.layer_norm(output + residual)

        return output
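A self-contained sketch of the data flow. The kernel sizes are hard-coded here instead of being read from hparams.py, so the exact values (9, 1) with paddings (4, 0) are my assumption; the essential point is that the padding keeps the sequence length unchanged so the residual can be added.

import torch
import torch.nn as nn
import torch.nn.functional as F

class ToyPositionwiseFeedForward(nn.Module):
    """Same structure as above, with kernel sizes hard-coded for the demo."""
    def __init__(self, d_in, d_hid, dropout=0.1):
        super().__init__()
        # kernel 9 / padding 4 and kernel 1 / padding 0 both preserve the sequence length
        self.w_1 = nn.Conv1d(d_in, d_hid, kernel_size=9, padding=4)
        self.w_2 = nn.Conv1d(d_hid, d_in, kernel_size=1, padding=0)
        self.layer_norm = nn.LayerNorm(d_in)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        residual = x
        output = self.w_2(F.relu(self.w_1(x.transpose(1, 2)))).transpose(1, 2)
        return self.layer_norm(self.dropout(output) + residual)

x = torch.randn(4, 100, 256)   # (batch, seq_len, d_in)
ffn = ToyPositionwiseFeedForward(d_in=256, d_hid=1024)
print(ffn(x).shape)            # torch.Size([4, 100, 256])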
Layers.py
This file defines the FFT block network together with some simple modules that may be used later (a few of which end up unused). The basic structure of these modules is similar to Tacotron and Transformer-TTS.
import torch
import torch.nn as nn
from torch.nn import functional as F
import numpy as np
from collections import OrderedDict
from transformer.SubLayers import MultiHeadAttention, PositionwiseFeedForward
from text.symbols import symbols
# Custom fully connected (linear) layer
class Linear(nn.Module):
    """
    Linear Module
    """

    def __init__(self, in_dim, out_dim, bias=True, w_init='linear'):
        """
        :param in_dim: dimension of input
        :param out_dim: dimension of output
        :param bias: boolean. if True, bias is included.
        :param w_init: str. weight inits with xavier initialization.
        """
        super(Linear, self).__init__()
        self.linear_layer = nn.Linear(in_dim, out_dim, bias=bias)

        nn.init.xavier_uniform_(
            self.linear_layer.weight,
            gain=nn.init.calculate_gain(w_init))

    def forward(self, x):
        return self.linear_layer(x)
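This wrapper only adds Xavier initialization whose gain matches the following nonlinearity; otherwise it behaves like a plain nn.Linear. Illustrative usage (sizes are made up):

from transformer.Layers import Linear

fc = Linear(256, 1024, w_init='relu')   # gain = nn.init.calculate_gain('relu') ≈ sqrt(2)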
# Pre-net: two fully connected layers with ReLU activation and dropout
class PreNet(nn.Module):
    """
    Pre Net before passing through the network
    """

    def __init__(self, input_size, hidden_size, output_size, p=0.5):
        """
        :param input_size: dimension of input
        :param hidden_size: dimension of hidden unit
        :param output_size: dimension of output
        """
        super(PreNet, self).__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_size = hidden_size
        # Two FC + ReLU + dropout stages, as described above
        self.layer = nn.Sequential(OrderedDict([
            ('fc1', Linear(self.input_size, self.hidden_size)),
            ('relu1', nn.ReLU()),
            ('dropout1', nn.Dropout(p)),
            ('fc2', Linear(self.hidden_size, self.output_size)),
            ('relu2', nn.ReLU()),
            ('dropout2', nn.Dropout(p)),
        ]))

    def forward(self, x):
        return self.layer(x)
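Illustrative usage (the sizes below are made up, e.g. 80-dimensional mel frames):

import torch
from transformer.Layers import PreNet

prenet = PreNet(input_size=80, hidden_size=256, output_size=256)
mel_frames = torch.randn(4, 100, 80)   # (batch, frames, n_mels)
print(prenet(mel_frames).shape)        # torch.Size([4, 100, 256])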

This note annotates the FastSpeech implementation in detail, covering the multi-head attention, position-wise feed-forward network and FFT blocks of the Transformer, as well as the LengthRegulator, Decoder and Encoder modules. FastSpeech is implemented in PyTorch on the LJSpeech dataset and also uses a pre-net, a post-net and a CBHG module. Building the model involves the encoder's encoding process, the LengthRegulator's adjustment of phoneme-sequence lengths, and the decoder's decoding steps.