使用pydub实现训练声音数据集加噪

最新推荐文章于 2023-07-20 17:51:29 发布

安安爸Chris

最新推荐文章于 2023-07-20 17:51:29 发布

阅读量651

点赞数

分类专栏：深度学习

本文链接：https://blog.youkuaiyun.com/mimiduck/article/details/117068894

版权

深度学习专栏收录该内容

28 篇文章

订阅专栏

在做VAD验证时，采用了真实人声数据加噪同时加标注的方式来生成验证集。

加噪的方式分为两种，两种数据是拼接还是叠加，通过参数mix type指定

如果有需要做VAD验证集的同学，可以直接参考下面的生成代码

代码如下：

import os
import sys
import random as rd
import uuid
import time
from pydub import AudioSegment
import numpy as np

def usage():
    print("batch_speech_plus_noise <speech path> <noise path> <output path> <mix type>\n"
          "mix type: 0; not overlap when mixing;  1: overlap\n")

def append_noise_data(list, nf):
    seg = AudioSegment.from_file(file=nf, sample_width=2, frame_rate=8000, channels=1, )
    list.append(seg)

def load_all_noise(noise_list, noise_path):
    for root, dirs, files in os.walk(noise_path, topdown=False):
        for name in files:
            append_noise_data(noise_list, os.path.join(root, name))

def generate_combined_file(list, sf, name, out, type):
    '''
    generate the combine file
    :param list:  the noise file list
    :param sf:    the speech file path
    :param name:  the file name without path
    :param out:   the output path for generated file
    :param type:  0: not overlap;  1: overlap
    :return:
    '''

    ni=rd.randint(0, len(list)-1)
    noise_seg = list[ni]
    noise_ms = len(noise_seg)

    speech_seg = AudioSegment.from_file(file=sf, sample_width=2, frame_rate=8000, channels=1, )
    ms = len(speech_seg)

    if ms > noise_ms:
        return

    # the file length is double of source file
    if noise_ms > 2*ms:
        mix_ms =  2*ms
    else:
        mix_ms = noise_ms

    print("speech_seg.max_dBFS: ",speech_seg.max_dBFS, ", noise_seg.max_dBFS: ", noise_seg.max_dBFS)
    print("speech_seg.dBFS: ", speech_seg.dBFS, ", noise_seg.dBFS: ", noise_seg.dBFS)
    if speech_seg.max_dBFS < noise_seg.max_dBFS+1 and abs(speech_seg.max_dBFS-noise_seg.max_dBFS) > 1:
        gap = abs(speech_seg.max_dBFS-noise_seg.max_dBFS)+1
        # reduce volume by dB
        noise_seg = noise_seg - gap
        print("noise_seg.apply_gain(", gap, ")")

    nstart=rd.randint(0,noise_ms-mix_ms-1)
    start=rd.randint(nstart,nstart+mix_ms-ms-1)
    print("the mixed file length : ", mix_ms,"ms")
    print("the mixed file start from ", nstart, "ms")
    print("the vad start time from ", start, "ms, and it is starting from ", start-nstart, "ms based on output file.")
    print("the vad stop time from ",start-nstart+ms, "ms")

    nm, ext = os.path.splitext(name)
    id = str(uuid.uuid1())[0:8]
    new_name=nm+"_"+id+ext
    new_file = os.path.join(out, new_name)
    new_name2=nm+"_"+id+".label"
    new_file2 = os.path.join(out, new_name2)

    f1= open(new_file, 'wb')
    part0 = np.array(noise_seg[nstart:start].get_array_of_samples())
    f1.write(part0.tobytes())
    if type == 0:
        part1 = np.array(speech_seg.get_array_of_samples())
    else:
        a = np.array(speech_seg.get_array_of_samples())
        b = np.array(noise_seg[start:start+ms].get_array_of_samples())
        part1 = a+b
    f1.write(part1.tobytes())
    part2 = np.array(noise_seg[start+ms:nstart+mix_ms].get_array_of_samples())
    f1.write(part2.tobytes())
    f1.close()

    f2 = open(new_file2, 'w')
    f2.write(str(start-nstart))
    f2.write(",")
    f2.write(str(start-nstart+ms))
    f2.close()

if len(sys.argv) != 5:
    usage()
    exit(-1)

speech_path=sys.argv[1]
noise_path=sys.argv[2]
output_path=sys.argv[3]
mix_type = int(sys.argv[4])
noise_list=[]


rd.seed(int(time.time()))

load_all_noise(noise_list, noise_path)

for root, dirs, files in os.walk(speech_path):
    for name in files:
        generate_combined_file(noise_list, os.path.join(root, name), name,output_path, mix_type)