材料准备
一块ESP32(本文所用为ESP32-S3-N16R8)
一块MAX98357A(功放模块)+一个喇叭
一块INMP441麦克风模块
一块5V锂电池
一块面包板
项目流程
细致分为十个步骤,如图
主要步骤为:
① 将麦克风录音的音频发送到百度语音智能云平台,调用百度语音识别技术,将音频转换为文字,识别的文字返回给ESP32
②ESP32将文字发送到MiniMax平台,调用MiniMax模型获取回答,并将回答以文字形式返回给ESP32
③ESP32将获取的回复内容发送到百度语音智能云平台,调用百度语音合成技术,将文字内容转换为音频,音频返回给ESP32,ESP32将合成的音频通过功放模块放出来
代码连线:
模块引脚说明:
ESP32 | MAX98357A |
16 | LRC |
15 | BCLK |
7 | DIN |
GND | GND |
3.3/5V | VIN |
ESP32 | INMP441 |
4 | SCK |
5 | WS |
6 | SD |
3.3V | VDD |
GND | GND |
INMP441与MAX98357A未提及的引脚不需要接线;MAX98357A还需接喇叭,注意正负极,红色为正,黑色为负
开发环境:Arduino IDE
开发板选取:ESP32S3 Dev Module
流程如下(默认已经有了ESP32环境)
如果说这个方法找不到开发板,还有一种方法,具体流程如下
实现代码:(代码有bug)
#include <WiFi.h>
#include <HTTPClient.h>
#include <ArduinoJson.h>
#include <driver/i2s.h>
#include <UrlEncode.h>
#include <base64.hpp>
#include <string.h>
#include "cJSON.h"
// I2S output (MAX98357A amplifier): port and GPIO numbers for bit clock, word select and data out
#define I2S_OUT_PORT I2S_NUM_1
#define I2S_OUT_BCLK 15
#define I2S_OUT_LRC 16
#define I2S_OUT_DOUT 7
// I2S input (INMP441 microphone): port and GPIO numbers for bit clock, word select and data in
#define I2S_IN_PORT I2S_NUM_0
#define I2S_IN_BCLK 4
#define I2S_IN_LRC 5
#define I2S_IN_DIN 6
// WiFi credentials (replace with your own network name and password)
const char* ssid = "name";
const char* password = "password";
// MiniMax API key; getGPTAnswer() sends it as "Bearer <apiKey>" in the Authorization header
const char* apiKey = "*******************";
// MiniMax chat-completion endpoint that getGPTAnswer() posts to
String apiUrl = "https://api.minimax.chat/v1/text/chatcompletion_v2";
const int STT_DEV_PID = 1537; // Optional recognition model: 1737 English, 1537 Mandarin (near-field), 1936 Mandarin far-field, 1837 Sichuanese
const char *STT_CUID = "*****************8"; // Unique user id used to tell users apart (UV statistics); a MAC address or IMEI is suggested, at most 60 characters
const char *STT_CLIENT_ID = "*****************8"; // Baidu API Key
const char *STT_CLIENT_SECRET = "*******************"; // Baidu Secret Key
// Audio recording settings
#define SAMPLE_RATE 16000
#define RECORD_TIME_SECONDS 15
// Number of samples in RECORD_TIME_SECONDS of audio.
// NOTE(review): loop() allocates and fills BUFFER_SIZE *bytes*; with the 16-bit samples
// configured in setup() that holds only half of RECORD_TIME_SECONDS.
#define BUFFER_SIZE (SAMPLE_RATE * RECORD_TIME_SECONDS)
/**
 * Bookkeeping for a sample buffer that is filled incrementally.
 *
 * buffer    - destination array for captured 16-bit samples
 * buf_ready - set to 1 once buf_count has reached n_samples
 * buf_count - index of the next sample slot to write
 * n_samples - number of samples after which the buffer counts as full
 */
struct inference_t {
  int16_t* buffer;
  uint8_t buf_ready;
  uint32_t buf_count;
  uint32_t n_samples;
};
// Shared capture state written by audio_inference_callback().
static inference_t inference;
// Capacity of sampleBuffer, in 16-bit samples.
static const uint32_t sample_buffer_size = 2048;
// Scratch buffer that capture_samples() reads I2S data into before it is copied onward.
static signed short sampleBuffer[sample_buffer_size];
static bool debug_nn = false; // Set this to true to see e.g. features generated from the raw signal (not referenced in the visible code)
// capture_samples() only reads from I2S while this flag is true.
static bool record_status = true;
/**
 * @brief Request an OAuth access token from the Baidu AI platform.
 *
 * Sends an empty-body POST to the Baidu token endpoint using the
 * client_credentials grant and extracts "access_token" from the JSON reply.
 *
 * @param api_key     Baidu application API Key (sent as client_id).
 * @param secret_key  Baidu application Secret Key (sent as client_secret).
 * @return The access token, or an empty String when the HTTP request fails,
 *         the reply is not valid JSON, or it carries no "access_token" field.
 */
String getAccessToken(const char* api_key, const char* secret_key) {
  String access_token = "";
  HTTPClient http;
  // Create the HTTP request
  http.begin("https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id=" + String(api_key) + "&client_secret=" + String(secret_key));
  int httpCode = http.POST("");
  if (httpCode == HTTP_CODE_OK) {
    String response = http.getString();
    // The reply carries several other fields besides the token, so leave headroom.
    DynamicJsonDocument doc(4096);
    DeserializationError parseError = deserializeJson(doc, response);
    if (parseError) {
      Serial.printf("[HTTP] access_token JSON parse failed: %s\n", parseError.c_str());
    } else {
      // `| ""` keeps a missing field as an empty string instead of the literal
      // "null", so the callers' `access_token == ""` checks stay meaningful.
      const char* token = doc["access_token"] | "";
      access_token = token;
      // printf is variadic: pass a C string, never the String object itself.
      Serial.printf("[HTTP] GET access_token: %s\n", access_token.c_str());
    }
  } else {
    Serial.printf("[HTTP] GET... failed, error: %s\n", http.errorToString(httpCode).c_str());
  }
  http.end();
  return access_token;
}
/**
 * @brief Join the configured WiFi network in station mode.
 *
 * Blocks until the connection is established, printing a dot once per second
 * while waiting, then prints the assigned IP address and the prompt banner.
 */
void wifi_setup() {
  WiFi.mode(WIFI_STA);
  WiFi.begin(ssid, password);
  Serial.print("Connecting to WiFi ..");
  for (;;) {
    if (WiFi.status() == WL_CONNECTED) {
      break;
    }
    Serial.print('.');
    delay(1000);
  }
  Serial.println(WiFi.localIP());
  Serial.println("Enter a prompt:");
}
/**
 * @brief Synthesize `text` with the Baidu text-to-speech REST API and stream
 *        the returned audio to the I2S amplifier via playAudio().
 *
 * Compared with a naive byte pump this version:
 *  - reads in 1 KB chunks and only sleeps when no data is available, so the
 *    network side can keep ahead of the 32 kB/s playback rate (16 kHz, 16-bit mono);
 *  - always hands playAudio() an even number of bytes so 16-bit samples never
 *    get split across two writes;
 *  - skips the WAV container header instead of playing it as audio;
 *  - pushes one DMA ring of silence before clearing, so the tail of the reply
 *    is played out instead of being zeroed while still queued;
 *  - gives up if the server stops sending for several seconds.
 *
 * @param access_token  Baidu OAuth token; nothing is sent when it is empty.
 * @param text          Text to speak; nothing is sent when it is empty.
 */
void baiduTTS_Send(String access_token, String text) {
  if (access_token == "") {
    Serial.println("access_token is null");
    return;
  }
  if (text.length() == 0) {
    Serial.println("text is null");
    return;
  }

  // Request parameters (names follow the Baidu TTS REST API: speaker, speed,
  // pitch, volume, audio format). aue = 6 requests WAV output.
  const int per = 1;
  const int spd = 6;
  const int pit = 5;
  const int vol = 15;
  const int aue = 6;

  // The text is URL-encoded twice before it is put into the query string.
  String encodedText = urlEncode(urlEncode(text));

  // Build the request URL
  String url = "https://tsn.baidu.com/text2audio";
  const char* header[] = { "Content-Type", "Content-Length" };
  url += "?tok=" + access_token;
  url += "&tex=" + encodedText;
  url += "&per=" + String(per);
  url += "&spd=" + String(spd);
  url += "&pit=" + String(pit);
  url += "&vol=" + String(vol);
  url += "&aue=" + String(aue);
  url += "&cuid=esp32s3";
  url += "&lan=zh";
  url += "&ctp=1";

  HTTPClient http;
  http.begin(url);
  http.collectHeaders(header, 2);
  const int httpResponseCode = http.GET();

  if (httpResponseCode != HTTP_CODE_OK) {
    Serial.print("Error code: ");
    Serial.println(httpResponseCode);
    if (httpResponseCode > 0) {
      // A real HTTP status came back: print the body for debugging.
      String response = http.getString();
      Serial.println(response);
    }
    http.end();
    return;
  }

  const String contentType = http.header("Content-Type");
  if (contentType.startsWith("audio")) {
    Serial.println("合成成功");

    // static: keeps the chunk buffer off the task stack (function is not reentrant).
    static uint8_t buffer[1024];
    // Assumes the canonical 44-byte PCM WAV header -- TODO confirm against a captured reply.
    size_t headerBytesLeft = (aue == 6) ? 44 : 0;
    // One DMA ring worth of bytes; matches dma_buf_count (8) x dma_buf_len (1024)
    // x 16-bit mono as configured for the output port in setup().
    const size_t kDmaRingBytes = 8 * 1024 * sizeof(int16_t);
    const uint32_t kStallTimeoutMs = 5000;

    int len = http.getSize();  // -1 when the reply has no Content-Length
    WiFiClient* stream = http.getStreamPtr();
    size_t carried = 0;  // 0 or 1 leftover byte kept at buffer[0] for sample alignment
    uint32_t lastDataMs = millis();

    while (http.connected() && (len > 0 || len == -1)) {
      const size_t available = stream->available();
      if (available == 0) {
        if (millis() - lastDataMs > kStallTimeoutMs) {
          Serial.println("TTS stream stalled");
          break;
        }
        delay(1);  // yield only while idle; never throttle while data is pending
        continue;
      }
      lastDataMs = millis();

      const size_t room = sizeof(buffer) - carried;
      const size_t want = (available < room) ? available : room;
      const size_t got = stream->readBytes(buffer + carried, want);
      if (got == 0) {
        continue;
      }
      if (len > 0) {
        len -= (int)got;
      }

      const size_t total = carried + got;
      size_t offset = 0;
      if (headerBytesLeft > 0) {
        offset = (headerBytesLeft < total) ? headerBytesLeft : total;
        headerBytesLeft -= offset;
      }

      // Play whole 16-bit samples only; keep a trailing odd byte for the next round.
      const size_t playable = (total - offset) & ~static_cast<size_t>(1);
      playAudio(buffer + offset, playable);
      carried = total - offset - playable;
      if (carried) {
        buffer[0] = buffer[offset + playable];
      }
    }

    // i2s_write() returns once data is queued, not once it has been played.
    // Queueing a full ring of silence blocks until the real audio has drained
    // and leaves only zeros behind for the DMA to cycle over.
    memset(buffer, 0, sizeof(buffer));
    for (size_t flushed = 0; flushed < kDmaRingBytes; flushed += sizeof(buffer)) {
      playAudio(buffer, sizeof(buffer));
    }
    clearAudio();
  } else if (contentType.equals("application/json")) {
    Serial.println("合成出现错误");
    String response = http.getString();  // Print the error JSON for debugging
    Serial.println(response);
  } else {
    Serial.println("未知的Content-Type: " + contentType);
  }
  http.end();
}
/**
 * @brief Ask the MiniMax chat-completion API for a reply to `inputText`.
 *
 * The request body is built with ArduinoJson so that quotes, backslashes and
 * control characters in the recognized text are escaped; concatenating the
 * text into a JSON literal produces an invalid request as soon as it contains
 * a double quote.
 *
 * @param inputText  User utterance to send as the "user" message.
 * @return The assistant's reply text, or "<error>" when the HTTP request
 *         fails, the reply cannot be parsed, or it contains no content.
 */
String getGPTAnswer(String inputText) {
  // Build the payload: model + a system message limiting the reply length + the user message.
  DynamicJsonDocument requestDoc(1024 + 2 * inputText.length());
  requestDoc["model"] = "abab5.5s-chat";
  JsonArray messages = requestDoc.createNestedArray("messages");
  JsonObject systemMessage = messages.createNestedObject();
  systemMessage["role"] = "system";
  systemMessage["content"] = "要求下面的回答严格控制在256字符以内";
  JsonObject userMessage = messages.createNestedObject();
  userMessage["role"] = "user";
  userMessage["content"] = inputText;
  String payload;
  serializeJson(requestDoc, payload);

  HTTPClient http;
  http.setTimeout(10000);
  http.begin(apiUrl);
  http.addHeader("Content-Type", "application/json");
  String token_key = String("Bearer ") + apiKey;
  http.addHeader("Authorization", token_key);

  int httpResponseCode = http.POST(payload);
  if (httpResponseCode != 200) {
    http.end();
    Serial.printf("Error %i \n", httpResponseCode);
    return "<error>";
  }

  String response = http.getString();
  http.end();
  Serial.println(response);

  // Parse the JSON reply. The reply carries the answer plus metadata, so a
  // 1 KB document is too small for a 256-character answer.
  DynamicJsonDocument jsonDoc(4096);
  DeserializationError parseError = deserializeJson(jsonDoc, response);
  if (parseError) {
    Serial.printf("Reply JSON parse failed: %s\n", parseError.c_str());
    return "<error>";
  }
  // `| ""` avoids turning a missing field into the literal text "null".
  const char* content = jsonDoc["choices"][0]["message"]["content"] | "";
  if (content[0] == '\0') {
    Serial.println("Reply has no message content");
    return "<error>";
  }
  return String(content);
}
void setup() {
// 设置串口波特率
Serial.begin(115200);
// 设置LED输出模式,并初始化设置为低
pinMode(LED_BUILTIN, OUTPUT);
digitalWrite(LED_BUILTIN, LOW); //Turn off
// Connect to WiFi
wifi_setup();
// Initialize I2S for audio output
i2s_config_t i2s_config_out = {
.mode = (i2s_mode_t)(I2S_MODE_MASTER | I2S_MODE_TX),
.sample_rate = SAMPLE_RATE,
.bits_per_sample = I2S_BITS_PER_SAMPLE_16BIT,
.channel_format = I2S_CHANNEL_FMT_ONLY_RIGHT,
.communication_format = (i2s_comm_format_t)(I2S_COMM_FORMAT_STAND_I2S),
.intr_alloc_flags = ESP_INTR_FLAG_LEVEL1,
.dma_buf_count = 8,
.dma_buf_len = 1024,
};
i2s_pin_config_t pin_config = {
.bck_io_num = I2S_OUT_BCLK,
.ws_io_num = I2S_OUT_LRC,
.data_out_num = I2S_OUT_DOUT,
.data_in_num = -1
};
i2s_driver_install(I2S_OUT_PORT, &i2s_config_out, 0, NULL);
i2s_set_pin(I2S_OUT_PORT, &pin_config);
// Initialize I2S for audio input
i2s_config_t i2s_config_in = {
.mode = (i2s_mode_t)(I2S_MODE_MASTER | I2S_MODE_RX),
.sample_rate = SAMPLE_RATE,
.bits_per_sample = I2S_BITS_PER_SAMPLE_16BIT, // 注意:INMP441 输出 32 位数据
.channel_format = I2S_CHANNEL_FMT_ONLY_LEFT,
.communication_format = i2s_comm_format_t(I2S_COMM_FORMAT_STAND_I2S),
.intr_alloc_flags = ESP_INTR_FLAG_LEVEL1,
.dma_buf_count = 8,
.dma_buf_len = 1024,
};
i2s_pin_config_t pin_config_in = {
.bck_io_num = I2S_IN_BCLK,
.ws_io_num = I2S_IN_LRC,
.data_out_num = -1,
.data_in_num = I2S_IN_DIN
};
i2s_driver_install(I2S_IN_PORT, &i2s_config_in, 0, NULL);
i2s_set_pin(I2S_IN_PORT, &pin_config_in);
}
/**
* @brief Arduino main function. Runs the inferencing loop.
*/
void loop() {
String baidu_access_token = "";
baidu_access_token = getAccessToken(STT_CLIENT_ID, STT_CLIENT_SECRET);
size_t bytes_read = 0, recordingSize = 0;
int16_t data[512];
// 分配内存
uint8_t* decode_data = (uint8_t*)ps_malloc(BUFFER_SIZE);
if (!decode_data) {
Serial.println("Failed to allocate memory for decode_data");
return;
}
uint8_t* pcm_data = (uint8_t*)ps_malloc(BUFFER_SIZE);
if (!pcm_data) {
Serial.println("Failed to allocate memory for pcm_data");
return;
}
while (1) {
// i2s录音
esp_err_t result = i2s_read(I2S_NUM_0, data, sizeof(data), &bytes_read, portMAX_DELAY);
// 打印采集的数据,用于调试
// for (int i = 0; i < bytes_read / 2; i++) {
// Serial.println(data[i]);
// }
// 放大音量
for (int i = 0; i < bytes_read / 2; i++) {
data[i] *= 20;
}
memcpy(pcm_data + recordingSize, data, bytes_read);
recordingSize += bytes_read;
// Serial.printf("%x recordingSize: %d bytes_read :%d\n", pcm_data + recordingSize, recordingSize, bytes_read);
if (recordingSize >= BUFFER_SIZE - bytes_read) {
break;
}
}
if (recordingSize > 0) {
// 音频转文本(语音识别API访问)
String recognizedText = baiduSTT_Send(baidu_access_token, pcm_data, recordingSize);
Serial.println("Recognized text: " + recognizedText);
String ernieResponse = getGPTAnswer(recognizedText);
Serial.println("Enter a prompt:");
Serial.println("Ernie Bot response: " + ernieResponse);
// 文本转音频tts并通过MAX98357A输出(语音合成API访问)
baiduTTS_Send(baidu_access_token, ernieResponse);
Serial.println("ttsSize: ");
}
// 释放内存
free(pcm_data);
delay(10);
}
/**
 * @brief Append freshly captured samples from sampleBuffer to inference.buffer.
 *
 * Copies n_bytes / 2 samples one by one. Whenever the write index reaches
 * inference.n_samples it is reset to 0 and inference.buf_ready is set to 1.
 *
 * @param n_bytes  Number of bytes of sampleBuffer to consume.
 */
static void audio_inference_callback(uint32_t n_bytes) {
  const uint32_t sample_total = n_bytes / 2;
  for (uint32_t idx = 0; idx < sample_total; ++idx) {
    inference.buffer[inference.buf_count] = sampleBuffer[idx];
    inference.buf_count += 1;
    if (inference.buf_count < inference.n_samples) {
      continue;
    }
    // Buffer full: wrap around and flag it as ready.
    inference.buf_count = 0;
    inference.buf_ready = 1;
  }
}
static void capture_samples(void* arg) {
const int32_t i2s_bytes_to_read = (uint32_t)arg;
size_t bytes_read = i2s_bytes_to_read;
while (1) {
if (record_status) {
/* read data at once from i2s - Modified for XIAO ESP2S3 Sense and I2S.h library */
i2s_read(I2S_IN_PORT, (void*)sampleBuffer, i2s_bytes_to_read, &bytes_read, 100);
// esp_i2s::i2s_read(esp_i2s::I2S_NUM_0, (void *)sampleBuffer, i2s_bytes_to_read, &bytes_read, 100);
if (bytes_read <= 0) {
Serial.println("Error in I2S read : %d");
} else {
if (bytes_read < i2s_bytes_to_read) {
Serial.println("Partial I2S read");
}
// scale the data (otherwise the sound is too quiet)
for (int x = 0; x < i2s_bytes_to_read / 2; x++) {
sampleBuffer[x] = (int16_t)(sampleBuffer[x]) * 8;
}
audio_inference_callback(i2s_bytes_to_read);
}
}
delay(1);
}
vTaskDelete(NULL);
}
/**
 * @brief Send recorded PCM audio to the Baidu speech recognition API and
 *        return the recognized text.
 *
 * The audio is Base64-encoded straight into the JSON request body, so only
 * one PSRAM buffer is needed and it is sized exactly (the previous 1.4x
 * estimate overflowed for short recordings because it ignored the fixed JSON
 * fields and the token). The body is posted as a raw byte buffer to avoid
 * copying it into a String.
 *
 * @param access_token   Baidu OAuth token; nothing is sent when it is empty.
 * @param audioData      16-bit mono PCM samples recorded at SAMPLE_RATE.
 * @param audioDataSize  Size of audioData in bytes.
 * @return The first recognition candidate, or an empty String on any failure.
 */
String baiduSTT_Send(String access_token, uint8_t* audioData, int audioDataSize) {
  String recognizedText = "";
  if (access_token == "") {
    Serial.println("access_token is null");
    return recognizedText;
  }
  if (audioData == NULL || audioDataSize <= 0) {
    Serial.println("audio data is empty");
    return recognizedText;
  }

  // Base64 turns every 3 input bytes into 4 output characters (padded).
  const size_t base64_len = 4 * (((size_t)audioDataSize + 2) / 3);
  // Fixed JSON text and numeric fields fit comfortably in the 192-byte allowance.
  const size_t json_capacity = base64_len + access_token.length() + strlen(STT_CUID) + 192;
  char* data_json = (char*)ps_malloc(json_capacity);
  if (!data_json) {
    Serial.println("Failed to allocate memory for data_json");
    return recognizedText;
  }

  // Everything up to the opening quote of the "speech" value.
  const int head_len = snprintf(data_json, json_capacity,
                                "{\"format\":\"pcm\",\"rate\":%d,\"dev_pid\":%d,\"channel\":1,"
                                "\"cuid\":\"%s\",\"token\":\"%s\",\"len\":%d,\"speech\":\"",
                                (int)SAMPLE_RATE, STT_DEV_PID, STT_CUID, access_token.c_str(), audioDataSize);
  // Need room for the head, the Base64 text, the closing "\"}" and a terminator.
  if (head_len < 0 || (size_t)head_len + base64_len + 3 > json_capacity) {
    Serial.println("Failed to build request JSON");
    free(data_json);
    return recognizedText;
  }

  // Base64-encode the audio directly behind the JSON head, then close the object.
  encode_base64(audioData, audioDataSize, (unsigned char*)data_json + head_len);
  size_t body_len = (size_t)head_len + base64_len;
  memcpy(data_json + body_len, "\"}", 3);  // includes the terminating '\0'
  body_len += 2;

  // Create the HTTP request
  HTTPClient http_client;
  http_client.begin("http://vop.baidu.com/server_api");
  http_client.addHeader("Content-Type", "application/json");
  int httpCode = http_client.POST((uint8_t*)data_json, body_len);
  // The request has been sent; release the large buffer before parsing the reply.
  free(data_json);

  if (httpCode > 0) {
    if (httpCode == HTTP_CODE_OK) {
      String response = http_client.getString();
      Serial.println(response);
      DynamicJsonDocument responseDoc(2048);
      DeserializationError parseError = deserializeJson(responseDoc, response);
      if (parseError) {
        Serial.printf("[HTTP] STT JSON parse failed: %s\n", parseError.c_str());
      } else {
        // "result" is an array of candidates. Converting the array itself to a
        // String yields its JSON text (brackets and quotes included), which then
        // corrupts the chat request; take the first candidate, "" when absent.
        const char* best = responseDoc["result"][0] | "";
        recognizedText = best;
      }
    } else {
      Serial.printf("[HTTP] POST returned status %d\n", httpCode);
    }
  } else {
    Serial.printf("[HTTP] POST failed, error: %s\n", http_client.errorToString(httpCode).c_str());
  }

  http_client.end();
  Serial.println(recognizedText);
  return recognizedText;
}
/**
 * @brief Queue raw audio bytes on the output I2S port (MAX98357A).
 *
 * Does nothing for an empty chunk; otherwise blocks in i2s_write() with no
 * timeout until the driver has accepted the data.
 *
 * @param audioData      Pointer to the bytes to send.
 * @param audioDataSize  Number of bytes at audioData.
 */
void playAudio(uint8_t* audioData, size_t audioDataSize) {
  if (audioDataSize == 0) {
    return;
  }
  size_t written = 0;
  i2s_write(I2S_OUT_PORT, static_cast<const void*>(audioData), audioDataSize, &written, portMAX_DELAY);
}
/**
 * @brief Zero the DMA buffers of the output I2S port and log that it happened.
 */
void clearAudio(void) {
  i2s_zero_dma_buffer(I2S_OUT_PORT);  // wipe whatever is still queued for output
  Serial.print("clearAudio");
}
代码需要修改部分:
①WIFI名称、密码
②MiniMax大模型 API Key(API Key很长很长)
③百度智能云语音技术申请的ID、API Key、Secret Key
目录
②和③都可以在相应网站申请,使用教程也有详细文档(百度智能云平台操作有点麻烦,耐心一点)
代码所存在的问题:
①程序目前录音没有问题,很灵敏,但是获取回答会有延迟,延迟较高15-30秒不等
②回答的语音存在卡顿,重复问题
③MiniMax模型回答不准确,比如你问今天是什么时候,它的回答是几年前的时间
④目前代码还不够规范,还未模块化
代码优化解决方案思路:
①对语音转文字和文字转语音部分封装成两个线程(我尝试过,但是效果还是不理想)
②采用按键方式,比如按下按键就录音,松开按键就停止录音,获取回答(网上大部分教程都有按键,我想可能也是解决延迟卡顿问题,但是不符合我的预期就没做)
③语音转文字和获取回答部分都没有问题(除了延迟),那我曾用离线语音模块替换文字转语音部分,但是声音非常AI(离线语音模块选取:DFRobot Gravity:离线中英文语音合成模块V2)
项目过程中遇到的其他问题:
①关于库:
#include <UrlEncode.h>
#include <base64.hpp>
这两个库分别用于对文本内容进行URL编码(UrlEncode.h)和对音频数据进行Base64编码(base64.hpp),版本很多,但是我使用下来只有这两个能用
②关于麦克风和功放模块引脚,我也有测试很多,但是都没有声音,也是只有目前这几个引脚使用下来是成功驱动的