After studying the FastSpeech paper, I went through a GitHub reproduction repository to better understand the details of the implementation. The chosen repository is a PyTorch implementation: https://github.com/xcmyz/FastSpeech. It uses the LJSpeech dataset; the data-processing code is covered in the note "FastSpeech reproduction GitHub project -- data preparation". This note adds detailed comments to the model-building code. The files related to model construction are those under the transformer directory, plus modules.py and model.py. I hope the annotations here help readers understand FastSpeech.
transformer
Because the FFT blocks in FastSpeech and the extraction of the alignment information used to train the duration predictor are both transformer-related, the project places the relevant files under the transformer directory.
Constants.py
This file defines constants that may be used during data processing.
PAD = 0
UNK = 1
BOS = 2
EOS = 3
PAD_WORD = '<blank>'
UNK_WORD = '<unk>'
BOS_WORD = '<s>'
EOS_WORD = '</s>'
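As a sketch of how these constants are typically used (the helper below is my own illustration, not necessarily the repo's exact code), PAD marks padding positions in a batched token-id sequence so that attention can mask them out:

import torch
import transformer.Constants as Constants

def get_attn_key_pad_mask(seq_k, seq_q):
    """Build a (batch, len_q, len_k) boolean mask that is True at padded key positions."""
    len_q = seq_q.size(1)
    padding_mask = seq_k.eq(Constants.PAD)                      # (batch, len_k)
    return padding_mask.unsqueeze(1).expand(-1, len_q, -1)      # (batch, len_q, len_k)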
Modules.py
This file implements a scaled dot-product attention module.
import torch
import torch.nn as nn
import numpy as np
# Scaled dot-product module used to compute attention
class ScaledDotProductAttention(nn.Module):
    ''' Scaled Dot-Product Attention '''

    def __init__(self, temperature, attn_dropout=0.1):
        super().__init__()
        self.temperature = temperature
        self.dropout = nn.Dropout(attn_dropout)
        self.softmax = nn.Softmax(dim=2)

    def forward(self, q, k, v, mask=None):
        # Batched matrix multiplication of q and k
        attn = torch.bmm(q, k.transpose(1, 2))
        attn = attn / self.temperature  # divide by the temperature (sqrt of d_k)
        if mask is not None:
            attn = attn.masked_fill(mask, -np.inf)
        attn = self.softmax(attn)
        attn = self.dropout(attn)
        output = torch.bmm(attn, v)
        return output, attn
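A quick shape check (the tensor sizes below are made up for illustration):

import torch
from transformer.Modules import ScaledDotProductAttention

attn_layer = ScaledDotProductAttention(temperature=8.0)   # temperature = sqrt(d_k) for d_k = 64
q = torch.randn(2, 5, 64)   # (batch, len_q, d_k)
k = torch.randn(2, 7, 64)   # (batch, len_k, d_k)
v = torch.randn(2, 7, 64)   # (batch, len_v, d_v), with len_v == len_k
output, attn = attn_layer(q, k, v)
print(output.shape, attn.shape)   # torch.Size([2, 5, 64]) torch.Size([2, 5, 7])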
SubLayers.py
This file defines the multi-head attention layer and the position-wise feed-forward network (FFN) used inside the FFT block.
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from transformer.Modules import ScaledDotProductAttention
import hparams as hp
# Multi-head attention module
class MultiHeadAttention(nn.Module):
    ''' Multi-Head Attention module '''

    def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
        super().__init__()
        self.n_head = n_head
        self.d_k = d_k
        self.d_v = d_v
        # Linear layers that project q, k, v before the attention is computed
        self.w_qs = nn.Linear(d_model, n_head * d_k)
        self.w_ks = nn.Linear(d_model, n_head * d_k)
        self.w_vs = nn.Linear(d_model, n_head * d_v)
        # Initialize the projection layers
        nn.init.normal_(self.w_qs.weight, mean=0,
                        std=np.sqrt(2.0 / (d_model + d_k)))
        nn.init.normal_(self.w_ks.weight, mean=0,
                        std=np.sqrt(2.0 / (d_model + d_k)))
        nn.init.normal_(self.w_vs.weight, mean=0,
                        std=np.sqrt(2.0 / (d_model + d_v)))

        self.attention = ScaledDotProductAttention(
            temperature=np.power(d_k, 0.5))
        self.layer_norm = nn.LayerNorm(d_model)

        self.fc = nn.Linear(n_head * d_v, d_model)
        nn.init.xavier_normal_(self.fc.weight)

        self.dropout = nn.Dropout(dropout)
    def forward(self, q, k, v, mask=None):
        d_k, d_v, n_head = self.d_k, self.d_v, self.n_head

        sz_b, len_q, _ = q.size()
        sz_b, len_k, _ = k.size()
        sz_b, len_v, _ = v.size()

        residual = q  # keep a copy for the residual connection added to the output

        # Project q, k, v and reshape to four dimensions, i.e. split into multiple heads
        q = self.w_qs(q).view(sz_b, len_q, n_head, d_k)
        k = self.w_ks(k).view(sz_b, len_k, n_head, d_k)
        v = self.w_vs(v).view(sz_b, len_v, n_head, d_v)
        # Merge the batch and head dimensions of q, k, v, reducing them back to three dimensions
        q = q.permute(2, 0, 1, 3).contiguous().view(-1,
                                                    len_q, d_k)  # (n*b) x lq x dk
        k = k.permute(2, 0, 1, 3).contiguous().view(-1,
                                                    len_k, d_k)  # (n*b) x lk x dk
        v = v.permute(2, 0, 1, 3).contiguous().view(-1,
                                                    len_v, d_v)  # (n*b) x lv x dv

        # Repeat the same mask once per head
        mask = mask.repeat(n_head, 1, 1)  # (n*b) x .. x ..
        output, attn = self.attention(q, k, v, mask=mask)  # attention output and attention matrix

        # Restore to four dimensions, then fold the heads back into the feature dimension (three dimensions again)
        output = output.view(n_head, sz_b, len_q, d_v)
        output = output.permute(1, 2, 0, 3).contiguous().view(
            sz_b, len_q, -1)  # b x lq x (n*dv)

        output = self.dropout(self.fc(output))
        output = self.layer_norm(output + residual)  # residual connection + layer norm

        return output, attn
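A small shape walkthrough for self-attention (the hyperparameter values 256/2/64 below are illustrative choices, not necessarily the repo's hparams):

import torch
from transformer.SubLayers import MultiHeadAttention

mha = MultiHeadAttention(n_head=2, d_model=256, d_k=64, d_v=64)
x = torch.randn(4, 100, 256)                       # (batch, seq_len, d_model); self-attention uses q = k = v = x
mask = torch.zeros(4, 100, 100, dtype=torch.bool)  # nothing masked in this toy example
out, attn = mha(x, x, x, mask=mask)
print(out.shape, attn.shape)                       # torch.Size([4, 100, 256]) torch.Size([8, 100, 100])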
# Position-wise feed-forward network (FFN) of the FFT block; both layers are 1-D convolutions, and a residual connection is applied
class PositionwiseFeedForward(nn.Module):
    ''' A two-feed-forward-layer module '''

    def __init__(self, d_in, d_hid, dropout=0.1):
        super().__init__()

        # Use Conv1d
        # position-wise
        self.w_1 = nn.Conv1d(
            d_in, d_hid, kernel_size=hp.fft_conv1d_kernel[0], padding=hp.fft_conv1d_padding[0])
        # position-wise
        self.w_2 = nn.Conv1d(
            d_hid, d_in, kernel_size=hp.fft_conv1d_kernel[1], padding=hp.fft_conv1d_padding[1])

        self.layer_norm = nn.LayerNorm(d_in)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        residual = x
        output = x.transpose(1, 2)  # Conv1d expects (batch, channels, length)
        output = self.w_2(F.relu(self.w_1(output)))
        output = output.transpose(1, 2)
        output = self.dropout(output)
        output = self.layer_norm(output + residual)

        return output
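A self-contained sketch of the data flow. The kernel sizes are hard-coded here instead of being read from hparams.py, so the exact values (9, 1) with paddings (4, 0) are my assumption; the essential point is that the padding keeps the sequence length unchanged so the residual can be added.

import torch
import torch.nn as nn
import torch.nn.functional as F

class ToyPositionwiseFeedForward(nn.Module):
    """Same structure as above, with kernel sizes hard-coded for the demo."""
    def __init__(self, d_in, d_hid, dropout=0.1):
        super().__init__()
        # kernel 9 / padding 4 and kernel 1 / padding 0 both preserve the sequence length
        self.w_1 = nn.Conv1d(d_in, d_hid, kernel_size=9, padding=4)
        self.w_2 = nn.Conv1d(d_hid, d_in, kernel_size=1, padding=0)
        self.layer_norm = nn.LayerNorm(d_in)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        residual = x
        output = self.w_2(F.relu(self.w_1(x.transpose(1, 2)))).transpose(1, 2)
        return self.layer_norm(self.dropout(output) + residual)

x = torch.randn(4, 100, 256)   # (batch, seq_len, d_in)
ffn = ToyPositionwiseFeedForward(d_in=256, d_hid=1024)
print(ffn(x).shape)            # torch.Size([4, 100, 256])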
Layers.py
This file defines the FFT block network together with some simple modules that may be used later (a few of which end up unused). The basic structure of these modules is similar to Tacotron and Transformer-TTS.
import torch
import torch.nn as nn
from torch.nn import functional as F
import numpy as np
from collections import OrderedDict
from transformer.SubLayers import MultiHeadAttention, PositionwiseFeedForward
from text.symbols import symbols
# Custom fully connected (linear) layer
class Linear(nn.Module):
    """
    Linear Module
    """

    def __init__(self, in_dim, out_dim, bias=True, w_init='linear'):
        """
        :param in_dim: dimension of input
        :param out_dim: dimension of output
        :param bias: boolean. if True, bias is included.
        :param w_init: str. weight inits with xavier initialization.
        """
        super(Linear, self).__init__()
        self.linear_layer = nn.Linear(in_dim, out_dim, bias=bias)

        nn.init.xavier_uniform_(
            self.linear_layer.weight,
            gain=nn.init.calculate_gain(w_init))

    def forward(self, x):
        return self.linear_layer(x)
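This wrapper only adds Xavier initialization whose gain matches the following nonlinearity; otherwise it behaves like a plain nn.Linear. Illustrative usage (sizes are made up):

from transformer.Layers import Linear

fc = Linear(256, 1024, w_init='relu')   # gain = nn.init.calculate_gain('relu') ≈ sqrt(2)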
# Pre-net: two fully connected layers with ReLU activation and dropout
class PreNet(nn.Module):
    """
    Pre Net before passing through the network
    """

    def __init__(self, input_size, hidden_size, output_size, p=0.5):
        """
        :param input_size: dimension of input
        :param hidden_size: dimension of hidden unit
        :param output_size: dimension of output
        """
        super(PreNet, self).__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_size = hidden_size
        # Two FC + ReLU + dropout stages, as described above
        self.layer = nn.Sequential(OrderedDict([
            ('fc1', Linear(self.input_size, self.hidden_size)),
            ('relu1', nn.ReLU()),
            ('dropout1', nn.Dropout(p)),
            ('fc2', Linear(self.hidden_size, self.output_size)),
            ('relu2', nn.ReLU()),
            ('dropout2', nn.Dropout(p)),
        ]))

    def forward(self, x):
        return self.layer(x)
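Illustrative usage (the sizes below are made up, e.g. 80-dimensional mel frames):

import torch
from transformer.Layers import PreNet

prenet = PreNet(input_size=80, hidden_size=256, output_size=256)
mel_frames = torch.randn(4, 100, 80)   # (batch, frames, n_mels)
print(prenet(mel_frames).shape)        # torch.Size([4, 100, 256])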

This note annotates the FastSpeech implementation in detail, covering the multi-head attention, position-wise feed-forward network and FFT blocks of the Transformer, as well as the LengthRegulator, Decoder and Encoder modules. FastSpeech is implemented in PyTorch on the LJSpeech dataset and also uses a pre-net, a post-net and a CBHG module. Building the model involves the encoder's encoding process, the LengthRegulator's adjustment of phoneme-sequence lengths, and the decoder's decoding steps.