声谱特征提取与恢复

最新推荐文章于 2024-01-02 17:45:25 发布

lawsX云

最新推荐文章于 2024-01-02 17:45:25 发布

阅读量1.8k

点赞数 1

CC 4.0 BY-SA版权

本文链接：https://blog.youkuaiyun.com/Boogyman/article/details/91378738

def get_spectrograms(fpath):
    '''Returns normalized log(melspectrogram) and log(magnitude) from a sound file.

    The waveform is loaded with `sr=hp.sr`, trimmed with
    `librosa.effects.trim`, pre-emphasized, and transformed with an STFT.
    Both the mel and the linear magnitudes are converted to decibels and
    rescaled with `hp.ref_db` / `hp.max_db` into the range [1e-8, 1].

    Args:
      fpath: A string. The full path of a sound file.

    Returns:
      mel: A 2d float32 array of shape (T, n_mels) <- Transposed
      mag: A 2d float32 array of shape (T, 1+n_fft//2) <- Transposed
    '''
    # The returned sample rate is not needed: hp.sr is used everywhere below.
    y, _ = librosa.load(fpath, sr=hp.sr)

    # Trimming
    y, _ = librosa.effects.trim(y)

    # Preemphasis: y[n] - preemphasis * y[n-1], keeping the first sample as is.
    y = np.append(y[0], y[1:] - hp.preemphasis * y[:-1])

    # stft
    linear = librosa.stft(y=y,
                          n_fft=hp.n_fft,
                          hop_length=hp.hop_length,
                          win_length=hp.win_length)

    # magnitude spectrogram
    mag = np.abs(linear)  # (1+n_fft//2, T)

    # mel spectrogram
    # Keyword arguments are required here: recent librosa releases reject
    # positional sr/n_fft/n_mels, while older releases accept the keywords too.
    mel_basis = librosa.filters.mel(sr=hp.sr,
                                    n_fft=hp.n_fft,
                                    n_mels=hp.n_mels)  # (n_mels, 1+n_fft//2)
    mel = np.dot(mel_basis, mag)  # (n_mels, T)

    # to decibel (floor at 1e-5 so log10 never sees zero)
    mel = 20 * np.log10(np.maximum(1e-5, mel))
    mag = 20 * np.log10(np.maximum(1e-5, mag))

    # normalize
    mel = np.clip((mel - hp.ref_db + hp.max_db) / hp.max_db, 1e-8, 1)
    mag = np.clip((mag - hp.ref_db + hp.max_db) / hp.max_db, 1e-8, 1)

    # Transpose
    mel = mel.T.astype(np.float32)  # (T, n_mels)
    mag = mag.T.astype(np.float32)  # (T, 1+n_fft//2)

    return mel, mag

def spectrogram2wav(mag):
    '''Converts a normalized log-magnitude spectrogram back into a waveform.

    Reverses the steps applied to `mag` in `get_spectrograms`:
    transpose, de-normalize to decibels, convert to linear amplitude,
    estimate the signal with `griffin_lim`, undo the pre-emphasis filter,
    and trim the result.

    Args:
      mag: A 2d array holding the normalized spectrogram. It is transposed
        on entry, so it is presumably laid out as (T, 1+n_fft//2), the
        layout `get_spectrograms` returns -- confirm against the caller.

    Returns:
      The reconstructed waveform as a float32 array.
    '''
    # Back to (freq, time) layout and into the valid normalized range.
    clipped = np.clip(mag.T, 0, 1)

    # De-normalize: inverse of (db - ref_db + max_db) / max_db.
    decibels = (clipped * hp.max_db) - hp.max_db + hp.ref_db

    # Decibels to amplitude: 10 ** (db / 20).
    amplitude = np.power(10.0, decibels * 0.05)

    # Waveform reconstruction.
    estimate = griffin_lim(amplitude)

    # De-preemphasis: inverse filter of y[n] - preemphasis * y[n-1].
    restored = signal.lfilter([1], [1, -hp.preemphasis], estimate)

    # Trim; the second return value of trim is not needed.
    trimmed, _ = librosa.effects.trim(restored)

    return trimmed.astype(np.float32)


def griffin_lim(spectrogram):
    '''Applies the Griffin-Lim algorithm to a magnitude spectrogram.

    Runs `hp.n_iter` rounds of: invert the current complex estimate to a
    signal, re-analyze it with an STFT, keep only the unit-modulus phase of
    that STFT, and combine it with the given magnitudes.

    NOTE(review): `invert_spectrogram` is not defined in this snippet; it is
    presumably an inverse-STFT helper defined elsewhere -- confirm.

    Args:
      spectrogram: A 2d array of linear magnitudes, in the layout expected
        by `invert_spectrogram`.

    Returns:
      The real part of the final inverted signal.
    '''
    X_best = copy.deepcopy(spectrogram)
    for _ in range(hp.n_iter):
        X_t = invert_spectrogram(X_best)
        # Keyword arguments match the stft call in get_spectrograms and are
        # required by recent librosa releases.
        est = librosa.stft(y=X_t,
                           n_fft=hp.n_fft,
                           hop_length=hp.hop_length,
                           win_length=hp.win_length)
        # Unit-modulus phase; the floor avoids division by zero.
        phase = est / np.maximum(1e-8, np.abs(est))
        X_best = spectrogram * phase
    X_t = invert_spectrogram(X_best)
    y = np.real(X_t)

    return y

超参数设置（上文代码中的 `hp` 即下面的 `Hyperparams` 类）：


# -*- coding: utf-8 -*-
#/usr/bin/python2
'''
By kyubyong park. kbpark.linguist@gmail.com. 
https://www.github.com/kyubyong/tacotron
'''
class Hyperparams:
    '''Container of constants for the pipeline, audio processing, model and training.'''

    # ---- pipeline ----
    # If True, run `python prepro.py` first before running `python train.py`.
    prepro = False

    # Character vocabulary. P: Padding, E: End of Sentence.
    vocab = "PE abcdefghijklmnopqrstuvwxyz'.?"

    # ---- data ----
    # Dataset location (an alternative used by the author: "/data/private/voice/nick").
    data = "./data/private/voice/LJSpeech-1.1"
    test_data = "./harvard_sentences.txt"
    # NOTE(review): presumably seconds -- confirm against the code that reads it.
    max_duration = 10.0

    # ---- signal processing ----
    # Sample rate.
    sr = 22050
    # FFT points (samples).
    n_fft = 2048
    # Frame shift and frame length, in seconds.
    frame_shift = 0.0125
    frame_length = 0.05
    # The same two quantities in samples (truncated to int).
    hop_length = int(sr * frame_shift)
    win_length = int(sr * frame_length)
    # Number of Mel banks to generate.
    n_mels = 80
    # Exponent for amplifying the predicted magnitude.
    power = 1.2
    # Number of inversion iterations.
    n_iter = 50
    # Pre-emphasis coefficient (or None).
    preemphasis = 0.97
    # Decibel range and reference level used to normalize spectrograms.
    max_db = 100
    ref_db = 20

    # ---- model ----
    # Embedding size, alias = E.
    embed_size = 256
    encoder_num_banks = 16
    decoder_num_banks = 8
    num_highwaynet_blocks = 4
    # Reduction factor. Paper => 2, 3, 5.
    r = 5
    dropout_rate = 0.5

    # ---- training scheme ----
    # Initial learning rate.
    lr = 0.001
    logdir = "./logdir/01"
    sampledir = "./samples"
    batch_size = 32