VAD Speech Segmentation: Algorithm Principles and Simple Implementations

0 Introduction

VAD, short for Voice Activity Detection, is a speech endpoint detection technique. Its main task is to accurately locate the start and end points of speech within a noisy recording.
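
To make the idea concrete before the full implementations below, here is a minimal sketch (not part of the original post) that builds a synthetic one-second signal and finds the start and end of a "speech" burst by thresholding short-time frame energy; the frame length and threshold are arbitrary choices for illustration.

# Toy endpoint detection on a synthetic signal, using short-time frame energy.
import numpy as np

fs = 16000                                    # assumed sample rate
t = np.arange(fs) / fs                        # one second of audio
signal = 0.01 * np.random.randn(fs)           # background noise
signal[4000:12000] += np.sin(2 * np.pi * 440 * t[4000:12000])   # simulated speech burst

frame_len = 400                               # 25 ms frames (arbitrary)
n_frames = fs // frame_len
energies = [np.mean(f ** 2) for f in np.split(signal[:n_frames * frame_len], n_frames)]
voiced = [e > 0.1 for e in energies]          # arbitrary energy threshold

start = voiced.index(True) * frame_len / fs
end = (len(voiced) - voiced[::-1].index(True)) * frame_len / fs
print("speech detected from %.2f s to %.2f s" % (start, end))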

1 A simple self-written implementation: real-time detection and segmentation of recordings based on amplitude and zero-crossing rate, with live microphone input

#coding=utf-8
"""Real-time VAD: detect and record speech from the microphone using
peak amplitude and zero-crossing rate."""

import argparse
import wave

import numpy as np
import pyaudio

CHUNK = 2000
FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000
WAVE_OUTPUT_FILENAME = "cache.wav"

p = pyaudio.PyAudio()

def start_stream(id):  # open a PyAudio input stream on the given device index
    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK,
                    input_device_index=id
                    )
    return stream

# Sign of a sample: 1 for non-negative values, 0 for negative values.
def sgn(data):
    if data >= 0:
        return 1
    else:
        return 0

# Count the zero crossings (sign changes) in a frame.
def calZeroCrossingRate(wave_data):
    count = 0
    for i in range(1, len(wave_data)):
        count = count + np.abs(sgn(wave_data[i]) - sgn(wave_data[i - 1]))
    return count


def Monitor(stream: pyaudio.Stream):

    frames = []
    # Wait for speech: high peak amplitude combined with a low zero-crossing rate.
    while True:
        data = stream.read(CHUNK, exception_on_overflow=False)
        audio_data = np.frombuffer(data, dtype=np.int16)
        temp = np.max(np.abs(audio_data))
        zerorate = calZeroCrossingRate(audio_data) / CHUNK
        print('current peak amplitude:', temp)

        if temp > 15000 and zerorate < 0.1:
            print("Speech detected! Start buffering the recording")
            frames.append(data)
            break
    # Keep recording until the peak amplitude drops back below the silence threshold.
    while True:
        data = stream.read(CHUNK, exception_on_overflow=False)
        audio_data = np.frombuffer(data, dtype=np.int16)
        frames.append(data)
        temp = np.max(np.abs(audio_data))
        print('current peak amplitude:', temp)
        if temp < 2000:
            break
    print("Recording finished!")
    wf = wave.open(WAVE_OUTPUT_FILENAME, 'wb')
    wf.setnchannels(CHANNELS)
    wf.setsampwidth(p.get_sample_size(FORMAT))
    wf.setframerate(RATE)
    wf.writeframes(b''.join(frames))
    wf.close()

def main():

    parser = argparse.ArgumentParser(description="Real-time VAD on live microphone input.")
    parser.add_argument('--micId', required=True, type=int, help="Input device index of the microphone.")
    args = parser.parse_args()
    stream = start_stream(args.micId)
    while True:
        Monitor(stream)

if __name__ == '__main__':
    main()
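
One note on the code above: calZeroCrossingRate loops over every sample in Python, which is slow for real-time use. A vectorized NumPy equivalent (my own sketch, not part of the original script) computes the same count:

# Vectorized zero-crossing count: same result as calZeroCrossingRate
# (sign changes between neighbouring samples), without a Python-level loop.
def zero_crossings(wave_data):
    signs = np.where(np.asarray(wave_data) >= 0, 1, 0)
    return int(np.sum(np.abs(np.diff(signs))))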

2 WAV file input

#coding=utf-8
"""Offline VAD: scan a WAV file and flag windows that look like speech,
using peak amplitude and zero-crossing rate."""

import wave

import numpy as np

# Sign of a sample: 1 for non-negative values, 0 for negative values.
def sgn(data):
    if data >= 0:
        return 1
    else:
        return 0

# Count the zero crossings (sign changes) in a frame.
def calZeroCrossingRate(wave_data):
    count = 0
    for i in range(1, len(wave_data)):
        count = count + np.abs(sgn(wave_data[i]) - sgn(wave_data[i - 1]))
    return count


def main():

    wavread = wave.open(r'/home/sunshine/桌面/code_C_PY_2022/py/8.pyaudio看麦wave录音/nr降噪.wav', 'rb')
    fs = wavread.getframerate()        # sampling frequency
    print("lg------getframerate", type(fs), fs)
    Nwavlen = wavread.getnframes()     # total number of audio frames
    print("lg------getnframes", type(Nwavlen), Nwavlen)
    Nchannel = wavread.getnchannels()  # number of channels
    print("lg------Nchannel", type(Nchannel), Nchannel)
    wav_str = wavread.readframes(Nwavlen)             # read the raw PCM bytes

    wav_int = np.frombuffer(wav_str, dtype=np.int16)  # convert bytes to int16 samples
    print(wav_int.shape)

    print(np.max(wav_int))
    # Slide an 800-sample window over the file; a window counts as speech when its
    # peak amplitude is high and its zero-crossing rate is low (the /2000
    # normalization is kept from the real-time version above).
    for i in range(0, wav_int.size, 800):
        if np.max(wav_int[i:i + 800]) > 1000 and (calZeroCrossingRate(wav_int[i:i + 800]) / 2000) < 0.15:
            print("speech segment at", i / fs, 's')

if __name__ == '__main__':
    main()
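
The loop above prints one line per flagged 800-sample window. To turn those windows into actual speech segments with start and end times, adjacent flagged windows can be merged; a possible sketch (my addition, reusing the same thresholds and the calZeroCrossingRate helper defined above, with a hypothetical merge_segments name):

# Merge consecutive voiced 800-sample windows into (start, end) segments, in seconds.
def merge_segments(wav_int, fs, win=800):
    segments = []
    seg_start = None
    for i in range(0, wav_int.size, win):
        frame = wav_int[i:i + win]
        voiced = np.max(frame) > 1000 and (calZeroCrossingRate(frame) / 2000) < 0.15
        if voiced and seg_start is None:
            seg_start = i                      # a new voiced run begins
        elif not voiced and seg_start is not None:
            segments.append((seg_start / fs, i / fs))
            seg_start = None
    if seg_start is not None:                  # the file ended inside a voiced run
        segments.append((seg_start / fs, wav_int.size / fs))
    return segments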

3 Based on the WebRTC VAD algorithm

'''
Requirements:
+ pyaudio - `pip install pyaudio`
+ py-webrtcvad - `pip install webrtcvad`

'''
import webrtcvad
import collections
import sys
import signal
import pyaudio

from array import array
from struct import pack
import wave
import time

FORMAT = pyaudio.paInt16
CHANNELS = 1
RATE = 16000
CHUNK_DURATION_MS = 30       # supports 10, 20 and 30 (ms)
PADDING_DURATION_MS = 1500   # length of the padding window (1.5 s)
CHUNK_SIZE = int(RATE * CHUNK_DURATION_MS / 1000)  # chunk to read
CHUNK_BYTES = CHUNK_SIZE * 2  # 16bit = 2 bytes, PCM
NUM_PADDING_CHUNKS = int(PADDING_DURATION_MS / CHUNK_DURATION_MS)
# NUM_WINDOW_CHUNKS = int(240 / CHUNK_DURATION_MS)
NUM_WINDOW_CHUNKS = int(400 / CHUNK_DURATION_MS)  # number of 30 ms chunks in a 400 ms window
NUM_WINDOW_CHUNKS_END = NUM_WINDOW_CHUNKS * 2

START_OFFSET = int(NUM_WINDOW_CHUNKS * CHUNK_DURATION_MS * 0.5 * RATE)

vad = webrtcvad.Vad(1)

pa = pyaudio.PyAudio()
stream = pa.open(format=FORMAT,
                 channels=CHANNELS,
                 rate=RATE,
                 input=True,
                 start=False,
                 input_device_index=6,  # microphone device index
                 frames_per_buffer=CHUNK_SIZE)


got_a_sentence = False
leave = False


def handle_int(sig, chunk):
    global leave, got_a_sentence
    leave = True
    got_a_sentence = True


def record_to_file(path, data, sample_width):
    "Write the int16 samples in 'data' to a mono WAV file at 'path'"
    data = pack('<' + ('h' * len(data)), *data)
    wf = wave.open(path, 'wb')
    wf.setnchannels(1)
    wf.setsampwidth(sample_width)
    wf.setframerate(RATE)
    wf.writeframes(data)
    wf.close()


def normalize(snd_data):
    "Average the volume out"
    MAXIMUM = 32767  # 16384
    times = float(MAXIMUM) / max(abs(i) for i in snd_data)
    r = array('h')
    for i in snd_data:
        r.append(int(i * times))
    return r

signal.signal(signal.SIGINT, handle_int)

while not leave:
    ring_buffer = collections.deque(maxlen=NUM_PADDING_CHUNKS)
    triggered = False
    voiced_frames = []
    ring_buffer_flags = [0] * NUM_WINDOW_CHUNKS
    ring_buffer_index = 0

    ring_buffer_flags_end = [0] * NUM_WINDOW_CHUNKS_END
    ring_buffer_index_end = 0
    buffer_in = ''
    # WangS
    raw_data = array('h')
    index = 0
    start_point = 0
    StartTime = time.time()
    print("* recording: ")
    stream.start_stream()

    while not got_a_sentence and not leave:
        chunk = stream.read(CHUNK_SIZE)
        # add WangS
        raw_data.extend(array('h', chunk))
        index += CHUNK_SIZE
        TimeUse = time.time() - StartTime

        active = vad.is_speech(chunk, RATE)

        sys.stdout.write('1' if active else '_')
        ring_buffer_flags[ring_buffer_index] = 1 if active else 0
        ring_buffer_index += 1
        ring_buffer_index %= NUM_WINDOW_CHUNKS

        ring_buffer_flags_end[ring_buffer_index_end] = 1 if active else 0
        ring_buffer_index_end += 1
        ring_buffer_index_end %= NUM_WINDOW_CHUNKS_END

        # start point detection
        if not triggered:
            ring_buffer.append(chunk)
            num_voiced = sum(ring_buffer_flags)
            if num_voiced > 0.8 * NUM_WINDOW_CHUNKS:
                sys.stdout.write(' Open ')
                triggered = True
                start_point = index - CHUNK_SIZE * 20  # roll the start back 20 chunks to keep leading audio
                # voiced_frames.extend(ring_buffer)
                ring_buffer.clear()
        # end point detection
        else:
            # voiced_frames.append(chunk)
            ring_buffer.append(chunk)
            num_unvoiced = NUM_WINDOW_CHUNKS_END - sum(ring_buffer_flags_end)
            if num_unvoiced > 0.90 * NUM_WINDOW_CHUNKS_END or TimeUse > 10:
                sys.stdout.write(' Close ')
                triggered = False
                got_a_sentence = True

        sys.stdout.flush()

    sys.stdout.write('\n')
    # data = b''.join(voiced_frames)

    stream.stop_stream()
    print("* done recording")
    got_a_sentence = False

    # Trim everything before the detected start point, then write to file.
    raw_data.reverse()
    for index in range(start_point):
        raw_data.pop()
    raw_data.reverse()
    raw_data = normalize(raw_data)
    record_to_file("recording.wav", raw_data, 2)
    leave = True

stream.close()
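
The same py-webrtcvad API can also be used offline on a WAV file: each call to is_speech must receive exactly 10, 20 or 30 ms of 16-bit mono PCM at 8, 16, 32 or 48 kHz. A minimal sketch (my addition; the file name sample.wav and the 16 kHz mono format are assumptions):

# Offline VAD over a 16 kHz, mono, 16-bit WAV file, in 30 ms frames.
import wave
import webrtcvad

vad = webrtcvad.Vad(2)               # aggressiveness 0 (least) .. 3 (most)
wf = wave.open("sample.wav", "rb")   # assumed: 16 kHz, mono, 16-bit PCM
rate = wf.getframerate()
pcm = wf.readframes(wf.getnframes())

frame_bytes = int(rate * 0.03) * 2   # 30 ms of samples, 2 bytes per sample
for offset in range(0, len(pcm) - frame_bytes + 1, frame_bytes):
    frame = pcm[offset:offset + frame_bytes]
    print("%.2f s  speech=%s" % (offset / 2 / rate, vad.is_speech(frame, rate)))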

4 VAD algorithm principles and a Verilog implementation flow

VAD (Voice Activity Detection) is an important step in speech signal processing: it detects the switch points between voiced and unvoiced segments so that the signal can then be framed, denoised, amplified, and so on. To implement a VAD algorithm in Verilog, you first need to understand its basic principle and flow, and then choose an implementation approach that fits your requirements.

The basic principle of VAD is to analyse features of the speech signal such as energy, zero-crossing rate and spectrum, and to decide whether the current signal belongs to a voiced or an unvoiced segment. Commonly used VAD algorithms include energy thresholding, zero-crossing-rate thresholding and frequency-domain methods.

The concrete steps for implementing VAD in Verilog are as follows:

1. Read the sampled speech data and preprocess it, e.g. remove the DC component and reduce noise.
2. Split the signal into frames, typically 20 ms or 30 ms long, i.e. 160 or 240 samples per frame at an 8 kHz sampling rate.
3. Extract features from each frame; common features include short-time energy, zero-crossing rate and the frequency spectrum.
4. Compute decision thresholds from the feature values and classify the current frame as voiced or unvoiced.
5. Segment the signal at the voiced/unvoiced switch points to obtain the speech segments.
6. Apply whatever post-processing is required, such as noise reduction or gain control.

This is the basic flow of a VAD implementation in Verilog; the specific approach can be chosen according to the requirements (a software reference sketch of steps 1 to 4 follows below). Note that Verilog is a hardware description language intended mainly for digital circuit design, so the hardware structure and resource consumption must be taken into account when implementing VAD.
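
Before mapping this flow to hardware, it can help to prototype it in software. Below is a minimal frame-based sketch of steps 1 to 4 in Python (my own reference, with arbitrary threshold values), which flags each frame from its short-time energy and zero-crossing rate:

import numpy as np

def simple_vad(samples, fs, frame_ms=20, energy_thr=1e6, zcr_thr=0.3):
    """Flag each frame as voiced (True) or unvoiced (False)."""
    frame_len = int(fs * frame_ms / 1000)
    x = samples.astype(np.float64)
    x = x - x.mean()                                         # step 1: remove the DC component
    flags = []
    for i in range(0, len(x) - frame_len + 1, frame_len):    # step 2: framing
        frame = x[i:i + frame_len]
        energy = np.sum(frame ** 2)                          # step 3: short-time energy
        zcr = np.mean(np.abs(np.diff(np.sign(frame)))) / 2   # step 3: zero-crossing rate
        flags.append(bool(energy > energy_thr and zcr < zcr_thr))   # step 4: threshold decision
    return flags  # step 5: segments follow from the True/False switch points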