使用pydub实现训练声音数据集加噪

在做VAD验证时,采用了真实人声数据加噪同时加标注的方式来生成验证集。

加噪的方式分为两种:拼接(人声与噪声不重叠)和叠加(人声与噪声混合在一起),通过参数 mix type 指定(0 表示拼接,1 表示叠加)。

如果有需要做VAD验证集的同学,可以直接参考下面的生成代码

代码如下:

import os
import sys
import random as rd
import uuid
import time
from pydub import AudioSegment
import numpy as np

def usage():
    """Print the command-line synopsis of this script to standard output."""
    synopsis = "batch_speech_plus_noise <speech path> <noise path> <output path> <mix type>\n"
    mix_help = "mix type: 0; not overlap when mixing;  1: overlap\n"
    # print() appends its own newline, so the text ends with a blank line.
    print(synopsis + mix_help)

def append_noise_data(list, nf):
    """
    Load one noise file and append the resulting segment to a list.

    :param list:  the list collecting loaded noise segments (mutated in place)
    :param nf:    the path of the noise file to load
    :return:      None
    """
    # Audio parameters handed to pydub: 2-byte samples, 8000 Hz, one channel.
    audio_params = {"sample_width": 2, "frame_rate": 8000, "channels": 1}
    segment = AudioSegment.from_file(file=nf, **audio_params)
    list.append(segment)

def load_all_noise(noise_list, noise_path):
    """
    Load every file found under a directory tree as a noise segment.

    :param noise_list:  the list that receives the loaded segments
    :param noise_path:  the root directory to scan recursively
    :return:            None
    """
    # Bottom-up traversal: sub-directories are visited before their parents.
    walker = os.walk(noise_path, topdown=False)
    for folder, _subdirs, filenames in walker:
        for filename in filenames:
            full_path = os.path.join(folder, filename)
            append_noise_data(noise_list, full_path)

def _mix_saturating(speech, noise):
    """Sum two integer sample arrays, clipping instead of wrapping on overflow."""
    # The two arrays can differ by a few samples because segment lengths are
    # measured in whole milliseconds; only the common prefix is mixed and any
    # remaining speech samples are kept unchanged.
    common = min(len(speech), len(noise))
    limits = np.iinfo(speech.dtype)
    mixed = speech.astype(np.int64)
    mixed[:common] += noise[:common]
    return np.clip(mixed, limits.min, limits.max).astype(speech.dtype)


def generate_combined_file(list, sf, name, out, type):
    """
    Generate one noisy audio file together with its VAD label file.

    A random noise segment is picked from ``list`` and a random window of it
    is used as background. The speech is placed at a random position inside
    that window, either replacing the noise (type 0) or summed with it
    (any other type). The samples are written back-to-back with no header,
    and a ``.label`` file holding ``<speech start>,<speech stop>`` (offsets
    relative to the start of the output, in the units reported by ``len()``
    of a segment, i.e. milliseconds for pydub) is written next to it.

    Nothing is written when the speech is empty or longer than the chosen
    noise segment.

    :param list:  the noise file list
    :param sf:    the speech file path
    :param name:  the file name without path
    :param out:   the output path for generated file
    :param type:  0: not overlap;  1: overlap
    :return:      None
    :raises ValueError: if the noise list is empty
    """
    if not list:
        raise ValueError("noise list is empty; load at least one noise file first")

    noise_seg = rd.choice(list)
    noise_ms = len(noise_seg)

    speech_seg = AudioSegment.from_file(file=sf, sample_width=2, frame_rate=8000, channels=1)
    ms = len(speech_seg)

    # An empty speech clip yields a useless sample, and a clip longer than the
    # noise cannot be embedded in it: skip both.
    if ms == 0 or ms > noise_ms:
        return

    # The output is at most twice as long as the speech, capped by the noise.
    mix_ms = min(2 * ms, noise_ms)

    print("speech_seg.max_dBFS: ",speech_seg.max_dBFS, ", noise_seg.max_dBFS: ", noise_seg.max_dBFS)
    print("speech_seg.dBFS: ", speech_seg.dBFS, ", noise_seg.dBFS: ", noise_seg.dBFS)
    # When the noise peak exceeds the speech peak by more than 1 dB, attenuate
    # the noise so that its peak ends up 1 dB below the speech peak.
    peak_excess = noise_seg.max_dBFS - speech_seg.max_dBFS
    if peak_excess > 1:
        gap = peak_excess + 1
        # reduce volume by dB
        noise_seg = noise_seg - gap
        print("noise_seg.apply_gain(", gap, ")")

    # random.randint is inclusive at both ends, so these are the exact valid
    # ranges; both are non-empty because ms <= mix_ms <= noise_ms.
    nstart = rd.randint(0, noise_ms - mix_ms)
    start = rd.randint(nstart, nstart + mix_ms - ms)
    print("the mixed file length : ", mix_ms,"ms")
    print("the mixed file start from ", nstart, "ms")
    print("the vad start time from ", start, "ms, and it is starting from ", start-nstart, "ms based on output file.")
    print("the vad stop time from ",start-nstart+ms, "ms")

    # A short unique suffix keeps repeated runs from overwriting earlier output.
    stem, ext = os.path.splitext(name)
    suffix = str(uuid.uuid1())[0:8]
    audio_file = os.path.join(out, stem + "_" + suffix + ext)
    label_file = os.path.join(out, stem + "_" + suffix + ".label")

    leading = np.array(noise_seg[nstart:start].get_array_of_samples())
    speech = np.array(speech_seg.get_array_of_samples())
    if type == 0:
        middle = speech
    else:
        overlap = np.array(noise_seg[start:start + ms].get_array_of_samples())
        middle = _mix_saturating(speech, overlap)
    trailing = np.array(noise_seg[start + ms:nstart + mix_ms].get_array_of_samples())

    # Context managers close the files even if a write fails.
    with open(audio_file, 'wb') as audio_out:
        audio_out.write(leading.tobytes())
        audio_out.write(middle.tobytes())
        audio_out.write(trailing.tobytes())

    with open(label_file, 'w') as label_out:
        label_out.write(str(start - nstart) + "," + str(start - nstart + ms))

# Script entry: validate the command line, load the noise pool, then generate
# one mixed file (plus label) for every file found under the speech path.
if len(sys.argv) != 5:
    usage()
    # sys.exit is always available; the bare exit() helper comes from the
    # site module and may be missing.
    sys.exit(-1)

speech_path = sys.argv[1]
noise_path = sys.argv[2]
output_path = sys.argv[3]
try:
    mix_type = int(sys.argv[4])
except ValueError:
    # A non-numeric mix type is a usage error, not a crash.
    usage()
    sys.exit(-1)
noise_list = []

# Make sure the destination exists before any output file is opened.
os.makedirs(output_path, exist_ok=True)

rd.seed(int(time.time()))

load_all_noise(noise_list, noise_path)

for root, dirs, files in os.walk(speech_path):
    for name in files:
        generate_combined_file(noise_list, os.path.join(root, name), name, output_path, mix_type)
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值