For VAD validation, the validation set is generated from real speech recordings by adding noise and writing the ground-truth labels at the same time. There are two mixing modes: the speech and noise are either concatenated or overlapped, selected via the mix type parameter.
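Conceptually the two modes differ only in how the speech samples are combined with the noise samples. A minimal numpy sketch of the difference (the speech and noise arrays here are hypothetical, equal-length int16 sample buffers, not part of the script below):

import numpy as np

speech = np.array([1000, -2000, 3000], dtype=np.int16)  # hypothetical speech samples
noise = np.array([200, 300, -400], dtype=np.int16)      # hypothetical noise samples

# mix type 0: concatenate -- noise, then speech, then noise again
concatenated = np.concatenate([noise, speech, noise])

# mix type 1: overlap -- add sample by sample, clipping to the int16 range
overlapped = np.clip(speech.astype(np.int32) + noise.astype(np.int32),
                     -32768, 32767).astype(np.int16)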
If you also need to build a VAD validation set, you can refer directly to the generation script below. The code is as follows:
import os
import sys
import random as rd
import uuid
import time
from pydub import AudioSegment
import numpy as np
def usage():
    print("batch_speech_plus_noise <speech path> <noise path> <output path> <mix type>\n"
          "mix type: 0: not overlap when mixing; 1: overlap\n")
def append_noise_data(noise_list, nf):
    # the inputs are assumed to be headerless 16-bit / 8 kHz mono PCM, hence
    # format="raw" (drop it if your files are regular wav)
    seg = AudioSegment.from_file(file=nf, format="raw", sample_width=2, frame_rate=8000, channels=1)
    noise_list.append(seg)
def load_all_noise(noise_list, noise_path):
    for root, dirs, files in os.walk(noise_path, topdown=False):
        for name in files:
            append_noise_data(noise_list, os.path.join(root, name))
def generate_combined_file(noise_list, sf, name, out, mix_type):
    '''
    Generate one combined speech-plus-noise file and its label file.
    :param noise_list: the list of loaded noise segments
    :param sf: the speech file path
    :param name: the file name without path
    :param out: the output path for the generated files
    :param mix_type: 0: not overlap; 1: overlap
    :return:
    '''
    # pick a random noise segment; it must be at least as long as the speech
    ni = rd.randint(0, len(noise_list) - 1)
    noise_seg = noise_list[ni]
    noise_ms = len(noise_seg)
    speech_seg = AudioSegment.from_file(file=sf, format="raw", sample_width=2, frame_rate=8000, channels=1)
    ms = len(speech_seg)
    if ms > noise_ms:
        return
    # cap the output length at double the speech length
    if noise_ms > 2 * ms:
        mix_ms = 2 * ms
    else:
        mix_ms = noise_ms
print("speech_seg.max_dBFS: ",speech_seg.max_dBFS, ", noise_seg.max_dBFS: ", noise_seg.max_dBFS)
print("speech_seg.dBFS: ", speech_seg.dBFS, ", noise_seg.dBFS: ", noise_seg.dBFS)
if speech_seg.max_dBFS < noise_seg.max_dBFS+1 and abs(speech_seg.max_dBFS-noise_seg.max_dBFS) > 1:
gap = abs(speech_seg.max_dBFS-noise_seg.max_dBFS)+1
# reduce volume by dB
noise_seg = noise_seg - gap
print("noise_seg.apply_gain(", gap, ")")
    # rd.randint is inclusive on both ends, so there is no -1 here; with -1 the
    # call raises ValueError whenever noise_ms == mix_ms (or mix_ms == ms below)
    nstart = rd.randint(0, noise_ms - mix_ms)
    start = rd.randint(nstart, nstart + mix_ms - ms)
    print("the mixed file length: ", mix_ms, "ms")
    print("the mixed file starts at ", nstart, "ms of the noise")
    print("the vad start time is ", start, "ms of the noise, i.e. ", start - nstart, "ms from the start of the output file")
    print("the vad stop time is ", start - nstart + ms, "ms")
    nm, ext = os.path.splitext(name)
    uid = str(uuid.uuid1())[0:8]
    new_name = nm + "_" + uid + ext
    new_file = os.path.join(out, new_name)
    label_name = nm + "_" + uid + ".label"
    label_file = os.path.join(out, label_name)
    f1 = open(new_file, 'wb')
    # leading noise-only part
    part0 = np.array(noise_seg[nstart:start].get_array_of_samples())
    f1.write(part0.tobytes())
    if mix_type == 0:
        # not overlap: pure speech is spliced between the two noise parts
        part1 = np.array(speech_seg.get_array_of_samples())
    else:
        # overlap: add speech and noise sample by sample; sum in int32 and clip,
        # otherwise the int16 addition silently wraps around on loud samples
        a = np.array(speech_seg.get_array_of_samples(), dtype=np.int32)
        b = np.array(noise_seg[start:start + ms].get_array_of_samples(), dtype=np.int32)
        n = min(len(a), len(b))
        part1 = np.clip(a[:n] + b[:n], -32768, 32767).astype(np.int16)
    f1.write(part1.tobytes())
    # trailing noise-only part
    part2 = np.array(noise_seg[start + ms:nstart + mix_ms].get_array_of_samples())
    f1.write(part2.tobytes())
    f1.close()
    # label format: "<speech start ms>,<speech end ms>", relative to the output file
    f2 = open(label_file, 'w')
    f2.write(str(start - nstart))
    f2.write(",")
    f2.write(str(start - nstart + ms))
    f2.close()
if len(sys.argv) != 5:
    usage()
    sys.exit(-1)
speech_path = sys.argv[1]
noise_path = sys.argv[2]
output_path = sys.argv[3]
mix_type = int(sys.argv[4])
noise_list = []
rd.seed(int(time.time()))
load_all_noise(noise_list, noise_path)
for root, dirs, files in os.walk(speech_path):
    for name in files:
        generate_combined_file(noise_list, os.path.join(root, name), name, output_path, mix_type)
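To run the script, pass the speech directory, the noise directory, the output directory, and the mix type; assuming it is saved as batch_speech_plus_noise.py:

python batch_speech_plus_noise.py ./speech ./noise ./out 1

Each generated audio file comes with a matching .label file holding the speech start and stop times in milliseconds, relative to the output file. A minimal sketch for reading a label back when scoring a VAD (the file name here is hypothetical):

# parse "start,end" (in ms) from a generated label file
with open("hello_1a2b3c4d.label") as f:
    vad_start_ms, vad_stop_ms = (int(v) for v in f.read().split(","))
print(vad_start_ms, vad_stop_ms)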