This approach feeds the PCM data buffer in directly rather than reading from a WAV file, which is a better fit for online STT. The prerequisite is that libfastasr.a has already been built from the FastASR source (see the CSDN post on configuring a cross-compiled ARM build of the FastASR speech-to-text library). The example source follows:
#include "speechrecog.h"
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <webrtc_vad.h>

#define UNTRIGGERED 0
#define TRIGGERED 1

// Segment-length thresholds in samples at 16 kHz.
#define SPEECH_LEN_5S (16000 * 5)
#define SPEECH_LEN_10S (16000 * 10)
#define SPEECH_LEN_20S (16000 * 20)
#define SPEECH_LEN_30S (16000 * 30)
// Sliding window over the last window_size VAD results; put() pushes a new
// 0/1 flag and returns how many voiced frames the window currently holds.
class AudioWindow {
  private:
    int *window;
    int in_idx;
    int out_idx;
    int sum;
    int window_size = 0;

  public:
    AudioWindow(int window_size) : window_size(window_size)
    {
        window = (int *)calloc(window_size + 1, sizeof(int));
        in_idx = 0;
        out_idx = 1;
        sum = 0;
    };
    ~AudioWindow()
    {
        free(window);   // the original version leaked this buffer
    };
    int put(int val)
    {
        sum = sum + val - window[out_idx];
        window[in_idx] = val;
        in_idx = (in_idx == window_size) ? 0 : (in_idx + 1);
        out_idx = (out_idx == window_size) ? 0 : (out_idx + 1);
        return sum;
    };
};
SpeechRecog::SpeechRecog()
{
    mm = create_model("/data/FastASR/models/k2_rnnt2_cli", 2);
}

SpeechRecog::~SpeechRecog()
{
    if (mm) {
        mm->reset();
        delete mm;   // release the model; the original only dropped the pointer
        mm = nullptr;
    }
}

void SpeechRecog::init()
{
    printf("SR init done\n");
}
string SpeechRecog::recognize(short *buffer, unsigned long framesize)
{
    VadInst *handle = WebRtcVad_Create();
    WebRtcVad_Init(handle);
    WebRtcVad_set_mode(handle, 2);   // aggressiveness 0..3
    int window_size = 10;
    AudioWindow audiowindow(window_size);
    int status = UNTRIGGERED;
    int offset = 0;
    int fs = 16000;
    int step = 480;                  // 30 ms per VAD frame at 16 kHz
    int start = 0;
    // Pad the length up to a multiple of align_size (declared in the header).
    int speech_align_len = (int)(ceil((float)framesize / align_size) * align_size);
    float scale = 32768;
    bool found = false;
    int16_t *speech_buff = (int16_t *)malloc(sizeof(int16_t) * speech_align_len);
    memset(speech_buff, 0, sizeof(int16_t) * speech_align_len);
    float *buf = (float *)malloc(sizeof(float) * speech_align_len);
    memset(buf, 0, sizeof(float) * speech_align_len);
    for (unsigned int i = 0; i < framesize; i++) {
        buf[i] = (float)buffer[i] / scale;   // normalized float copy for the recognizer
        speech_buff[i] = buffer[i];          // int16 copy for the VAD
    }
    result_str = "";
    while (offset + step < (long)framesize) {   // avoids unsigned underflow when framesize < step
        // WebRtcVad_Process returns 1 for a voiced 30 ms frame, 0 for silence.
        int n = WebRtcVad_Process(handle, fs, speech_buff + offset, step);
        if ((status == UNTRIGGERED) && (audiowindow.put(n) >= (window_size - 1))) {
            // Nearly every frame in the window is voiced: speech has started.
            start = offset - step * (window_size - 1);
            if (start < 0)
                start = 0;   // guard against reading before the buffer
            status = TRIGGERED;
            found = false;
        } else if (status == TRIGGERED) {
            int win_weight = audiowindow.put(n);
            int voice_len = (offset - start);
            int gap = 0;
            // Require progressively more voiced frames to keep a long segment open;
            // segments shorter than 5 s are never cut.
            if (voice_len < SPEECH_LEN_5S) {
                offset += step;
                continue;
            } else if (voice_len < SPEECH_LEN_10S) {
                gap = 1;
            } else if (voice_len < SPEECH_LEN_20S) {
                gap = window_size / 5;
            } else {
                gap = window_size / 2;
            }
            if (win_weight < gap) {
                // Trailing silence detected: close this segment and recognize it.
                status = UNTRIGGERED;
                float num_samples = offset - start;
                float frame_length = 400;   // 25 ms analysis window
                float frame_shift = 160;    // 10 ms shift
                float num_new_samples =
                    ceil((num_samples - frame_length) / frame_shift) * frame_shift + frame_length;
                offset = start + (int)num_new_samples;
                mm->reset();
                // buf + start .. buf + start + num_new_samples is the voiced range.
                string msg = mm->forward(buf + start, (int)num_new_samples, 2);
                result_str += msg;
                found = true;
            }
        }
        offset += step;
    }
    if (!found && (status == TRIGGERED)) {
        // The buffer ended while speech was still active: recognize the tail.
        float num_samples = offset - start;
        float frame_length = 400;
        float frame_shift = 160;
        float num_new_samples =
            ceil((num_samples - frame_length) / frame_shift) * frame_shift + frame_length;
        mm->reset();
        // forward() flag values: S_begin = 0, S_middle = 1, S_end = 2.
        string msg = mm->forward(buf + start, (int)num_new_samples, 2);
        result_str += msg;
    }
    // result_str concatenates the text of every segment found in this buffer.
    WebRtcVad_Free(handle);   // the original leaked the VAD handle
    free(buf);
    free(speech_buff);
    return result_str;
}
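For reference, the implementation above assumes a speechrecog.h that declares mm, result_str, and align_size. The original post does not include it, so the following is only a minimal sketch consistent with the code; in particular, the align_size value here is an assumption and should be set to whatever padding your FastASR model expects:

// speechrecog.h: a minimal sketch matching the .cpp above, not the original file.
#ifndef SPEECHRECOG_H
#define SPEECHRECOG_H

#include <string>
#include "Model.h"   // copied from the FastASR source package

using namespace std;

class SpeechRecog {
  public:
    SpeechRecog();
    ~SpeechRecog();
    void init();
    string recognize(short *buffer, unsigned long framesize);

  private:
    Model *mm = nullptr;      // created by FastASR's create_model()
    string result_str;        // accumulated text for the current buffer
    int align_size = 1360;    // ASSUMPTION: padding alignment; adjust for your model
};

#endif

With this in place, a caller simply constructs a SpeechRecog, fills a short array with 16 kHz mono PCM, and passes it to recognize() to get the concatenated text back.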
The CMake configuration is straightforward: just link the fastasr, fftw3f, openblas, and webrtcvad libraries. Note, however, that the webrtc directory and Model.h from the FastASR source package must be copied into the current project directory, and add_subdirectory("./webrtc") added to the CMake file. The header corresponding to the source above is simple; the sketch before this paragraph shows one way to write it.
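A minimal CMakeLists.txt along those lines might look like the following. This is a sketch under two assumptions: that the copied webrtc subdirectory builds a target named webrtcvad, and that the cross-compiled libfastasr.a, fftw3f, and openblas are already on the linker search path; adjust the names and paths to your tree.

cmake_minimum_required(VERSION 3.10)
project(speechrecog_demo CXX)

# webrtc VAD sources copied from the FastASR source package into ./webrtc
add_subdirectory("./webrtc")

add_executable(srdemo main.cpp speechrecog.cpp)

# Model.h and speechrecog.h live in the project root.
target_include_directories(srdemo PRIVATE ${CMAKE_SOURCE_DIR})

# The library names follow the list given above.
target_link_libraries(srdemo fastasr fftw3f openblas webrtcvad)

For a cross build, make sure the linker resolves these against the ARM builds of the libraries rather than the host ones.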