This approach feeds the PCM data buffer in directly rather than reading from a WAV file, which is a better fit for online STT. The prerequisite is that libfastasr.a has already been built from the FastASR source (see the CSDN post on configuring a cross-compiled ARM build of the FastASR speech-to-text library). The example source follows:
#include "speechrecog.h"
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <webrtc_vad.h>

#define UNTRIGGERED 0
#define TRIGGERED 1

// Segment-length thresholds in samples at 16 kHz.
#define SPEECH_LEN_5S (16000 * 5)
#define SPEECH_LEN_10S (16000 * 10)
#define SPEECH_LEN_20S (16000 * 20)
#define SPEECH_LEN_30S (16000 * 30)
// Sliding window over the last window_size VAD results; put() pushes a new
// 0/1 flag and returns how many voiced frames the window currently holds.
class AudioWindow {
  private:
    int *window;
    int in_idx;
    int out_idx;
    int sum;
    int window_size = 0;

  public:
    AudioWindow(int window_size) : window_size(window_size)
    {
        window = (int *)calloc(window_size + 1, sizeof(int));
        in_idx = 0;
        out_idx = 1;
        sum = 0;
    };
    ~AudioWindow()
    {
        free(window);   // the original version leaked this buffer
    };
    int put(int val)
    {
        sum = sum + val - window[out_idx];
        window[in_idx] = val;
        in_idx = (in_idx == window_size) ? 0 : (in_idx + 1);
        out_idx = (out_idx == window_size) ? 0 : (out_idx + 1);
        return sum;
    };
};
SpeechRecog::SpeechRecog()
{
    mm = create_model("/data/FastASR/models/k2_rnnt2_cli", 2);
}

SpeechRecog::~SpeechRecog()
{
    if (mm) {
        mm->reset();
        delete mm;   // release the model; the original only dropped the pointer
        mm = nullptr;
    }
}

void SpeechRecog::init()
{
    printf("SR init done\n");
}
string SpeechRecog::recognize(short *buffer, unsigned long framesize)
{
    VadInst *handle = WebRtcVad_Create();
    WebRtcVad_Init(handle);
    WebRtcVad_set_mode(handle, 2);   // aggressiveness 0..3
    int window_size = 10;
    AudioWindow audiowindow(window_size);
    int status = UNTRIGGERED;
    int offset = 0;
    int fs = 16000;
    int step = 480;                  // 30 ms per VAD frame at 16 kHz
    int start = 0;
    // Pad the length up to a multiple of align_size (declared in the header).
    int speech_align_len = (int)(ceil((float)framesize / align_size) * align_size);
    float scale = 32768;
    bool found = false;
    int16_t *speech_buff = (int16_t *)malloc(sizeof(int16_t) * speech_align_len);
    memset(speech_buff, 0, sizeof(int16_t) * speech_align_len);
    float *buf = (float *)malloc(sizeof(float) * speech_align_len);
    memset(buf, 0, sizeof(float) * speech_align_len);
    for (unsigned int i = 0; i < framesize; i++) {
        buf[i] = (float)buffer[i] / scale;   // normalized float copy for the recognizer
        speech_buff[i] = buffer[i];          // int16 copy for the VAD
    }
    result_str = "";
    while (offset + step < (long)framesize) {   // avoids unsigned underflow when framesize < step
        // WebRtcVad_Process returns 1 for a voiced 30 ms frame, 0 for silence.
        int n = WebRtcVad_Process(handle, fs, speech_buff + offset, step);
        if ((status == UNTRIGGERED) && (audiowindow.put(n) >= (window_size - 1))) {
            // Nearly every frame in the window is voiced: speech has started.
            start = offset - step * (window_size - 1);
            if (start < 0)
                start = 0;   // guard against reading before the buffer
            status = TRIGGERED;
            found = false;
        } else if (status == TRIGGERED) {
            int win_weight = audiowindow.put(n);
            int voice_len = (offset - start);
            int gap = 0;
            // Require progressively more voiced frames to keep a long segment open;
            // segments shorter than 5 s are never cut.
            if (voice_len < SPEECH_LEN_5S) {
                offset += step;
                continue;
            } else if (voice_len < SPEECH_LEN_10S) {
                gap = 1;
            } else if (voice_len < SPEECH_LEN_20S) {
                gap = window_size / 5;
            } else {
                gap = window_size / 2;
            }
            if (win_weight < gap) {
                // Trailing silence detected: close this segment and recognize it.
                status = UNTRIGGERED;
                float num_samples = offset - start;
                float frame_length = 400;   // 25 ms analysis window
                float frame_shift = 160;    // 10 ms shift
                float num_new_samples =
                    ceil((num_samples - frame_length) / frame_shift) * frame_shift + frame_length;
                offset = start + (int)num_new_samples;
                mm->reset();
                // buf + start .. buf + start + num_new_samples is the voiced range.
                string msg = mm->forward(buf + start, (int)num_new_samples, 2);
                result_str += msg;
                found = true;
            }
        }
        offset += step;
    }
    if (!found && (status == TRIGGERED)) {
        // The buffer ended while speech was still active: recognize the tail.
        float num_samples = offset - start;
        float frame_length = 400;
        float frame_shift = 160;
        float num_new_samples =
            ceil((num_samples - frame_length) / frame_shift) * frame_shift + frame_length;
        mm->reset();
        // forward() flag values: S_begin = 0, S_middle = 1, S_end = 2.
        string msg = mm->forward(buf + start, (int)num_new_samples, 2);
        result_str += msg;
    }
    // result_str concatenates the text of every segment found in this buffer.
    WebRtcVad_Free(handle);   // the original leaked the VAD handle
    free(buf);
    free(speech_buff);
    return result_str;
}
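For reference, the implementation above assumes a speechrecog.h that declares mm, result_str, and align_size. The original post does not include it, so the following is only a minimal sketch consistent with the code; in particular, the align_size value here is an assumption and should be set to whatever padding your FastASR model expects:

// speechrecog.h: a minimal sketch matching the .cpp above, not the original file.
#ifndef SPEECHRECOG_H
#define SPEECHRECOG_H

#include <string>
#include "Model.h"   // copied from the FastASR source package

using namespace std;

class SpeechRecog {
  public:
    SpeechRecog();
    ~SpeechRecog();
    void init();
    string recognize(short *buffer, unsigned long framesize);

  private:
    Model *mm = nullptr;      // created by FastASR's create_model()
    string result_str;        // accumulated text for the current buffer
    int align_size = 1360;    // ASSUMPTION: padding alignment; adjust for your model
};

#endif

With this in place, a caller simply constructs a SpeechRecog, fills a short array with 16 kHz mono PCM, and passes it to recognize() to get the concatenated text back.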
The CMake configuration is straightforward: just link the fastasr, fftw3f, openblas, and webrtcvad libraries. Note, however, that the webrtc directory and Model.h from the FastASR source package must be copied into the current project directory, and add_subdirectory("./webrtc") added to the CMake file. The header corresponding to the source above is simple; the sketch before this paragraph shows one way to write it.
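A minimal CMakeLists.txt along those lines might look like the following. This is a sketch under two assumptions: that the copied webrtc subdirectory builds a target named webrtcvad, and that the cross-compiled libfastasr.a, fftw3f, and openblas are already on the linker search path; adjust the names and paths to your tree.

cmake_minimum_required(VERSION 3.10)
project(speechrecog_demo CXX)

# webrtc VAD sources copied from the FastASR source package into ./webrtc
add_subdirectory("./webrtc")

add_executable(srdemo main.cpp speechrecog.cpp)

# Model.h and speechrecog.h live in the project root.
target_include_directories(srdemo PRIVATE ${CMAKE_SOURCE_DIR})

# The library names follow the list given above.
target_link_libraries(srdemo fastasr fftw3f openblas webrtcvad)

For a cross build, make sure the linker resolves these against the ARM builds of the libraries rather than the host ones.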