前言
在jetson上搭建一个AI语音家居助手,主要需要四个部分的工作:
- 1️⃣获取麦克风输入,语音激活功能+语音转文字功能
- 2️⃣部署大语言模型理解转换后的文字输入 (本地推理或云推理)
- 3️⃣提取大模型推理文字结果后,执行文字转语音功能,播放推理结果
- 4️⃣根据推理结果,选择动作执行(控制家居设备)
本文主要完成1️⃣的搭建,2️⃣的搭建可以参考笔者之前的文章:
https://blog.youkuaiyun.com/Vingnir/article/details/149119735?spm=1001.2014.3001.5502
该部分功能的参考源码放在github仓库:
硬件构成
该部分主要由三个硬件构成:
Jetson主控- Orin NX 16g super,其他Orin系列产品也可。
麦克风- 四通道,单通道和双通道也可以,用于获取用户语音输入。
扬声器-USB接口或者其他接口都行,可以播放声音就行。
本人使用硬件如下:
主控:https://www.seeedstudio.com/reComputer-Super-J4012-p-6443.html
麦克风:https://www.seeedstudio.com/ReSpeaker-4-Mic-Array-for-Raspberry-Pi.html
首先运行编译好的wav2text程序:
pasuspender -- sudo ./wav2text
然后运行record_lite
:
pasuspender -- sudo ./record_lite
运行的时候增加pasuspender
选项是因为需要挂起PulseAudio,以便我们的程序后端可以直接访问ALSA设备,而不会被PulseAudio占用,避免音频设备冲突问题。
运行结果如下:
record_lite
进程检测到声音能量大于阈值后开始录音,没有输入超过mute的时间阈值后结束录音:
wav2text
完成语音转文字的请求,输出语音转文字结果,并进行语音激活检测,激活默认播放activate.mp3
,可以替换为你想要触发的激活响应语音:
whisper-server
完成语音转文字的推理请求,并把推理结果发送给wav2text
:
总结
至此,我们完成了AI语音家居助手搭建的第二个模块。结合本地部署的大语言模型或云端的大语言模型,就能实现一个雏形;剩下的工作是action部分,并把大模型的输出整理后转为语音输出,以增加互动性。
附录
这里把主要的源码贴一下,如果访问不了github可以直接从这里复制:
------------respeaker.cpp------------⬇
#include <alsa/asoundlib.h>
#include <iostream>
#include <fstream>
#include <string>
#include <chrono>
#include <thread>
#define SAMPLE_RATE 44100            // capture sample rate in Hz
#define BIT_PER_SAMPLE 16            // bits per PCM sample
#define CHANNELS 2                   // channel count requested from ALSA
#define FORMAT SND_PCM_FORMAT_S16_LE // 16-bit signed little-endian PCM
#define RECORD_MS 10000 // maximum recording length per segment (ms)
#define SLIENCE_MS 2000 // silence duration that ends a segment (ms) — NOTE(review): name is a typo of SILENCE_MS; kept as-is since renaming would break any other user
#define DEVICE_NAME "plughw:2,0" // set to YOUR microphone device; raw hw:2,0 has poor compatibility, avoid it
typedef int16_t bit_per; // in-memory/on-disk sample type, must match FORMAT
#include "voice_gate.hpp"
// --- Global recording state shared across the helpers in this file ---
std::ofstream wavFile;                  // currently-open output WAV stream
size_t totalBytes = 0;                  // bytes of PCM payload written so far
unsigned int sample_rate = SAMPLE_RATE; // rate reported by / requested from ALSA
int dir;                                // rounding-direction out-param for ALSA queries
snd_pcm_format_t format;                // sample format reported by ALSA
unsigned int channels = 4;              // channel count; overwritten by the ALSA query in main()
// Write a canonical 44-byte RIFF/WAVE header for uncompressed PCM.
// The RIFF size (offset 4) and data size (offset 40) are written as 0
// placeholders; finalize_wav() patches them once the capture is done.
// Multi-byte fields are serialized with memcpy instead of the original
// *(int32_t*)(header + N) casts, which perform unaligned writes
// (undefined behavior). Field byte order still assumes a little-endian
// host, which holds for the Jetson target.
void write_wav_header(std::ofstream &file, int sample_rate, int num_channels, int bits_per_sample) {
    char header[44] = {0};
    int16_t audio_format = 1; // 1 = PCM, no compression
    int32_t byte_rate = sample_rate * num_channels * bits_per_sample / 8;
    int16_t block_align = num_channels * bits_per_sample / 8;
    int32_t zero32 = 0;            // placeholder value for the two size fields
    int32_t subchunk1_size = 16;   // fmt chunk payload size for PCM
    int32_t rate32 = sample_rate;
    int16_t channels16 = static_cast<int16_t>(num_channels);
    int16_t bits16 = static_cast<int16_t>(bits_per_sample);
    // RIFF chunk descriptor
    memcpy(header, "RIFF", 4);
    memcpy(header + 4, &zero32, 4);        // placeholder: overall file size - 8
    memcpy(header + 8, "WAVE", 4);
    // fmt subchunk
    memcpy(header + 12, "fmt ", 4);
    memcpy(header + 16, &subchunk1_size, 4);
    memcpy(header + 20, &audio_format, 2);
    memcpy(header + 22, &channels16, 2);
    memcpy(header + 24, &rate32, 4);
    memcpy(header + 28, &byte_rate, 4);
    memcpy(header + 32, &block_align, 2);
    memcpy(header + 34, &bits16, 2);
    // data subchunk
    memcpy(header + 36, "data", 4);
    memcpy(header + 40, &zero32, 4);       // placeholder: PCM payload size
    file.write(header, 44);
}
// Patch the two size placeholders left by write_wav_header():
//   offset 4:  RIFF chunk size = 36 + dataBytes (file size minus 8)
//   offset 40: data chunk size = dataBytes
// The original wrote 4 bytes straight out of a size_t (8 bytes on 64-bit),
// which only happened to work on little-endian hosts; copy through explicit
// int32_t locals instead. WAV sizes are 32-bit, so values are truncated if
// the capture somehow exceeds 2 GiB.
void finalize_wav(std::ofstream &file, size_t dataBytes) {
    int32_t fileSize = static_cast<int32_t>(36 + dataBytes);
    int32_t dataSize = static_cast<int32_t>(dataBytes);
    file.seekp(4, std::ios::beg);
    file.write(reinterpret_cast<char*>(&fileSize), 4);
    file.seekp(40, std::ios::beg);
    file.write(reinterpret_cast<char*>(&dataSize), 4);
}
// Capture loop: wait for the energy gate to detect speech, then record
// interleaved S16_LE PCM into respeaker_output.wav until silence or the
// RECORD_MS cap, finalize the WAV header, and go back to listening.
//
// Fixes vs. the original:
//  * the chunk that triggered the gate was written TWICE (the inner loop's
//    first pass skipped the read via an `enable` flag and re-wrote the same
//    buffer, also double-counting recorded_samples);
//  * the first chunk's byte count omitted the CHANNELS factor, so the WAV
//    header's "data" size was short;
//  * the inner loop fed rc to gate.update() BEFORE checking rc for
//    -EPIPE / other errors;
//  * the over-long-read warning printed buffer_frames instead of the
//    buffer_l_frames limit actually used.
int main() {
    snd_pcm_t *pcm_handle;
    snd_pcm_hw_params_t *params;
    int dir;
    int rc = -1;
    // Open the capture device; "plughw" inserts ALSA's conversion plugin.
    rc = snd_pcm_open(&pcm_handle, DEVICE_NAME, SND_PCM_STREAM_CAPTURE, 0);
    if (rc < 0) {
        std::cerr << "Unable to open PCM device: " << snd_strerror(rc) << std::endl;
        return 1;
    }
    // Allocate a hw-params object and report the device defaults.
    snd_pcm_hw_params_malloc(&params);
    snd_pcm_hw_params_any(pcm_handle, params);
    snd_pcm_hw_params_get_rate(params, &sample_rate, &dir);
    std::cout << "采样率: " << sample_rate << " Hz" << std::endl;
    snd_pcm_hw_params_get_channels(params, &channels);
    std::cout << "通道数: " << channels << std::endl;
    snd_pcm_hw_params_get_format(params, &format);
    std::cout << "格式: " << snd_pcm_format_name(format)
              << " (" << snd_pcm_format_description(format) << ")" << std::endl;
    // Configure the stream: interleaved S16_LE, CHANNELS, sample_rate.
    snd_pcm_hw_params_set_access(pcm_handle, params, SND_PCM_ACCESS_RW_INTERLEAVED);
    snd_pcm_hw_params_set_format(pcm_handle, params, FORMAT);
    snd_pcm_hw_params_set_channels(pcm_handle, params, CHANNELS);
    snd_pcm_hw_params_set_rate(pcm_handle, params, sample_rate, 0);
    snd_pcm_hw_params(pcm_handle, params);
    snd_pcm_hw_params_free(params);
    snd_pcm_prepare(pcm_handle);

    int buffer_frames = 1024 * 2;
    bit_per* buffer = new bit_per[buffer_frames]();                 // zero-initialized
    int buffer_l_frames = 1024;
    bit_per* buffer_l = new bit_per[buffer_l_frames * CHANNELS]();  // zero-initialized
    std::string filename = "respeaker_output.wav";

    // Energy gate deciding when a speech segment starts and stops.
    VoiceGate gate(/*energyThresh*/ 500, // per-frame energy threshold
                   /*silenceMs */ RECORD_MS,
                   /*sampleRate */ SAMPLE_RATE,
                   /*frameSamples*/ buffer_l_frames * CHANNELS);
    size_t total_samples = SAMPLE_RATE * RECORD_MS / 1000; // per-channel frame cap
    size_t recorded_samples = 0;

    while (true) {
        rc = snd_pcm_readi(pcm_handle, buffer_l, buffer_l_frames);
        if (rc < 0) continue;
        if (rc > buffer_l_frames) {
            std::cerr << "rc = " << rc << " exceeds buffer_l_frames = " << buffer_l_frames << std::endl;
            continue;
        }
        if (rc > 0) {
            gate.update(buffer_l, rc * CHANNELS);
        }
        if (gate.startSegment()) {
            std::cout << "Start recording segment...٩(๑>◡<๑)۶" << std::endl;
            if (wavFile.is_open()) wavFile.close();
            wavFile.open(filename, std::ios::binary);
            write_wav_header(wavFile, SAMPLE_RATE, CHANNELS, BIT_PER_SAMPLE); // 16-bit PCM
            // Write the chunk that triggered the gate (exactly once).
            wavFile.write(reinterpret_cast<char*>(buffer_l), rc * CHANNELS * sizeof(bit_per));
            recorded_samples += rc;
            totalBytes += rc * CHANNELS * sizeof(bit_per);
            // Keep capturing until silence is detected or the length cap hits.
            while ((recorded_samples < total_samples) && (gate.stopSegment() == false)) {
                rc = snd_pcm_readi(pcm_handle, buffer_l, buffer_l_frames);
                if (rc == -EPIPE) {
                    std::cerr << "Overrun occurred" << std::endl;
                    snd_pcm_prepare(pcm_handle);
                    continue;
                } else if (rc < 0) {
                    std::cerr << "Error reading: " << snd_strerror(rc) << std::endl;
                    break;
                }
                gate.update(buffer_l, rc * CHANNELS);
                wavFile.write(reinterpret_cast<char*>(buffer_l), rc * CHANNELS * sizeof(bit_per));
                recorded_samples += rc;
                totalBytes += rc * CHANNELS * sizeof(bit_per);
            }
            std::cout << "Recording is over..(>^ω^<)" << std::endl;
            recorded_samples = 0;
            finalize_wav(wavFile, totalBytes);
            wavFile.close();
            totalBytes = 0;
        }
    }
    // Unreachable with the while(true) above; kept for completeness.
    snd_pcm_drain(pcm_handle);
    snd_pcm_close(pcm_handle);
    delete[] buffer;
    delete[] buffer_l;
    std::cout << "Saved to " << filename << std::endl;
    return 0;
}
------------voice_gate.hpp------------⬇
#ifndef VOICE_GATE_HPP
#define VOICE_GATE_HPP
#include <stdio.h>
#include <cstdint>
#include <cstdlib>
#include <cmath>
#include <chrono>  // added: this header used std::chrono but relied on the
                   // including .cpp to have pulled in <chrono> first
typedef int16_t bit_per; // sample type fed to the gate (16-bit PCM)
// Energy-based voice-activity gate.
// Feed interleaved PCM frames through update(); the gate switches to
// Recording when one frame's mean absolute amplitude exceeds the threshold,
// and back to Standby after SILENCE_LIMIT_FRAMES consecutive quiet frames.
class VoiceGate {
public:
    // energyThresh: per-frame mean-absolute-amplitude threshold (e.g. 800)
    // silenceMs:    silence duration that ends a segment, ms (e.g. 1000)
    // sampleRate:   capture rate in Hz (e.g. 16000)
    // frameSamples: samples handed to update() per call (all channels)
    VoiceGate(double energyThresh,
              int silenceMs,
              int sampleRate,
              int frameSamples)
        : THRESH(energyThresh),
          SILENCE_LIMIT_FRAMES((silenceMs * sampleRate) / (1000 * frameSamples)),
          samplesPerFrame(frameSamples) {}
    // Process one frame of `len` samples and advance the state machine.
    void update(const bit_per* data, int len) {
        double e = meanAbsEnergy(data, len);
        if (state == Standby) {
            if (e > THRESH) { // speech starts
                state = Recording;
                silenceCtr = 0;
                started = true;
                stopped = false;
            }
        } else { // Recording
            if (e > THRESH) {
                re_mute = true;  // speech continues; re-arm the silence timer
                silenceCtr = 0;
            } else {
                // Fixed: the original wrote `if (re_mute = true)` (assignment,
                // always true), which restarted the silence timestamp on
                // every quiet frame instead of only on the first one.
                if (re_mute == true) {
                    start_time = std::chrono::high_resolution_clock::now();
                    re_mute = false;
                }
                ++silenceCtr;
                if (silenceCtr > SILENCE_LIMIT_FRAMES) {
                    state = Standby; // speech ended
                    started = false;
                    stopped = true;
                    auto over_time = std::chrono::high_resolution_clock::now();
                    std::chrono::duration<double, std::milli> duration_ms = over_time - start_time;
                    (void)duration_ms; // measured mute duration; only used for debugging
                }
            }
        }
    }
    bool isRecording() const { return state == Recording; }
    // One-shot: returns true exactly once per detected segment start.
    bool startSegment() { bool f = started; started = false; return f; }
    // Latched: true from segment end until the next segment starts.
    bool stopSegment() { bool f = stopped; return f; }
private:
    enum { Standby, Recording } state = Standby;
    const double THRESH;            // was float; ctor takes double — avoid narrowing
    const int SILENCE_LIMIT_FRAMES; // quiet frames required to end a segment
    const int samplesPerFrame;      // cap on samples scanned per frame
    int silenceCtr = 0;
    bool started = false;
    bool stopped = false;
    bool re_mute = true;
    // Portable spelling; the original named the libstdc++-internal
    // std::chrono::_V2 namespace, which other standard libraries lack.
    std::chrono::high_resolution_clock::time_point start_time;
    // Mean absolute amplitude of the first min(len, samplesPerFrame) samples.
    // (The original called this rmsEnergy but never squared the samples, and
    // carried a range check that is impossible for int16_t — both fixed.)
    double meanAbsEnergy(const bit_per* buf, int len) const {
        if (len <= 0) return 0.0; // guard: avoid division by zero on empty reads
        if (len > samplesPerFrame) len = samplesPerFrame;
        double sum = 0.0;
        for (int i = 0; i < len; ++i) {
            sum += std::abs(static_cast<int>(buf[i]));
        }
        return sum / len;
    }
};
#endif
------------shared_state.h------------⬇
// shared_state.h
// Handshake states exchanged between the recorder and wav2text processes over
// the /tmp/state_pipe* FIFOs. Values are written/read as raw bytes with
// write()/read(), so both binaries must be built from the same definition and
// the enumerator order must never change.
#ifndef SHARED_STATE_H
#define SHARED_STATE_H
enum class VoiceState {
    RECORDED,   // a WAV segment is complete and ready to transcribe (checked by wav2text)
    RECORDING,  // capture in progress — not referenced in the code shown here; presumably used by the recorder
    DECODE,     // wav2text: transcription request sent to whisper-server
    PLAYING,    // activation audio playing — not referenced in the code shown here
    RESTART,    // wav2text: done with this segment; recorder may resume capture
    INIT        // startup value before any message has been exchanged
};
#endif
------------wav2text.cpp------------⬇
// POSIX / system
#include <fcntl.h>
#include <sys/stat.h>   // for mkfifo
#include <sys/types.h>  // for mode_t
#include <unistd.h>
// C++ standard library
#include <algorithm>    // std::any_of
#include <chrono>
#include <cstdlib>      // std::system
#include <fstream>
#include <iostream>
#include <string>
#include <thread>
#include <vector>       // std::vector was used below but never included
// Third-party
#include <curl/curl.h>
#include <nlohmann/json.hpp>
// Project
#include "shared_state.h"
using json = nlohmann::json;
// libcurl write callback: append the received chunk to the std::string that
// `userp` points at and report the number of bytes consumed.
static size_t WriteCallback(void *contents, size_t size, size_t nmemb, void *userp) {
    const size_t total = size * nmemb;
    auto *out = static_cast<std::string*>(userp);
    out->append(static_cast<const char*>(contents), total);
    return total;
}
// POST the WAV at `filepath` as multipart/form-data to the local
// whisper-server /inference endpoint and return the raw JSON response body
// (empty string if curl could not be initialized or nothing was received).
// NOTE(review): curl_global_init/curl_global_cleanup run on EVERY call here;
// libcurl documents them as once-per-program and not thread-safe — fine for
// this single-threaded loop, but hoist them to main() if threads are added.
std::string transcribe_audio(const std::string& filepath) {
    CURL *curl;
    CURLcode res;
    std::string readBuffer;          // response body, filled by WriteCallback
    curl_mime *form = nullptr;
    curl_mimepart *field = nullptr;
    curl_global_init(CURL_GLOBAL_ALL);
    curl = curl_easy_init();
    if(curl) {
        form = curl_mime_init(curl);
        // Attach the WAV file as the "file" form field
        field = curl_mime_addpart(form);
        curl_mime_name(field, "file");
        curl_mime_filedata(field, filepath.c_str());
        // Ask the server for JSON output (response_format = json)
        field = curl_mime_addpart(form);
        curl_mime_name(field, "response_format");
        curl_mime_data(field, "json", CURL_ZERO_TERMINATED);
        curl_easy_setopt(curl, CURLOPT_URL, "http://127.0.0.1:8080/inference");
        curl_easy_setopt(curl, CURLOPT_MIMEPOST, form);
        // Route the response body into readBuffer
        curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteCallback);
        curl_easy_setopt(curl, CURLOPT_WRITEDATA, &readBuffer);
        // Perform the request (blocking)
        res = curl_easy_perform(curl);
        // Report transport-level failures (server errors still return a body)
        if(res != CURLE_OK)
            std::cerr << "curl_easy_perform() failed: " << curl_easy_strerror(res) << std::endl;
        curl_mime_free(form);
        curl_easy_cleanup(curl);
    }
    curl_global_cleanup();
    return readBuffer;
}
// Return true if `text` contains any of `keywords` as a substring.
// Matching is case-sensitive — callers list both capitalizations explicitly.
// An empty keyword list yields false.
bool contains_keywords(const std::string& text, const std::vector<std::string>& keywords) {
    // std::any_of short-circuits on the first match, same as the manual loop.
    return std::any_of(keywords.begin(), keywords.end(),
                       [&text](const std::string& keyword) {
                           return text.find(keyword) != std::string::npos;
                       });
}
// Event loop: wait for RECORDED on /tmp/state_pipe, transcribe the captured
// WAV via whisper-server, play the activation sound when a wake keyword is
// found, then signal RESTART back on /tmp/state_pipe2.
//
// Fixes vs. the original:
//  * read()'s return value was ignored — with O_NONBLOCK a failed read left
//    `state` stale, so one RECORDED message could re-trigger forever;
//  * the loop busy-spun at 100% CPU; it now sleeps briefly when idle;
//  * nlohmann::json::parse() throws on an empty/malformed response (e.g.
//    whisper-server down), which crashed the process — now caught, and
//    j.value("text","") tolerates a missing "text" field;
//  * the RESTART reply was duplicated in both branches — hoisted;
//  * a stray "breakpoint" debug printf was removed.
int main() {
    mkfifo("/tmp/state_pipe2", 0666); // create our reply pipe (EEXIST is fine)
    // /tmp/state_pipe itself is expected to be created by the recorder process.
    int pipe_fd = open("/tmp/state_pipe", O_RDONLY | O_NONBLOCK);
    int pipe_ba = open("/tmp/state_pipe2", O_WRONLY | O_NONBLOCK);
    VoiceState state = VoiceState::INIT;
    VoiceState state_ba = VoiceState::INIT;
    while (1) {
        ssize_t n = read(pipe_fd, &state, sizeof(state));
        if (n != sizeof(state)) {
            // No (complete) message yet — back off instead of busy-waiting.
            std::this_thread::sleep_for(std::chrono::milliseconds(20));
            continue;
        }
        if (state == VoiceState::RECORDED) {
            // Tell the recorder we are busy decoding this segment.
            state_ba = VoiceState::DECODE;
            write(pipe_ba, &state_ba, sizeof(state_ba));
            std::string result_json = transcribe_audio("respeaker_output.wav");
            std::cout << "Whisper 推理结果:\n" << result_json << std::endl;
            // Extract the transcription; tolerate a bad/empty server response.
            std::string text;
            try {
                nlohmann::json j = nlohmann::json::parse(result_json);
                text = j.value("text", "");
            } catch (const std::exception& e) {
                std::cerr << "Failed to parse whisper response: " << e.what() << std::endl;
            }
            // Wake keywords (case-sensitive, hence both capitalizations).
            std::vector<std::string> keywords = {"hello", "Hello", "super", "Super"};
            if (contains_keywords(text, keywords)) {
                std::cout << "One or more keywords matched!" << std::endl;
                const std::string device = "plughw:0,3"; // card 0, device 3 (speaker)
                const std::string file = "activate.mp3"; // activation response clip
                std::string cmd = "mpg123 -a " + device + " \"" + file + "\"";
                std::cout << "Executing: " << cmd << std::endl;
                int ret = system(cmd.c_str());
                if (ret != 0) {
                    std::cerr << "mpg123 playback failed!" << std::endl;
                }
            } else {
                std::cout << "No keywords matched." << std::endl;
            }
            // Either way, let the recorder capture the next segment.
            state_ba = VoiceState::RESTART;
            write(pipe_ba, &state_ba, sizeof(state_ba));
        }
    }
    close(pipe_fd); // unreachable with while(1); kept for completeness
    close(pipe_ba);
    return 0;
}