ESP32实现AI问答，调用MiniMax大模型+百度语音

最新推荐文章于 2025-07-28 00:00:26 发布

monkey_fighting

最新推荐文章于 2025-07-28 00:00:26 发布

阅读量1k

点赞数 13

CC 4.0 BY-SA版权

文章标签：人工智能百度

本文链接：https://blog.youkuaiyun.com/monkey_fighting/article/details/148688482

材料准备

一块ESP32（本文所用为ESP32-S3-N16R8）

一块MAX98357A（功放模块）+一个喇叭

一块INMP441麦克风模块

一块5V锂电池

一块面包板

项目流程

细致分为十个步骤，如图

主要步骤为：

① 将麦克风录音的音频发送到百度语音智能云平台，调用百度语音识别技术，将音频转换为文字，识别的文字返回给ESP32

② ESP32将文字发送到MiniMax平台，调用MiniMax模型获取回答，并将回答以文字形式返回给ESP32

③ESP32将获取的回复内容发送到百度语音智能云平台，调用百度语音合成技术，将文字内容转换为音频，音频返回给ESP32,ESP32将合成的音频通过功放模块放出来

代码连线：

模块引脚说明：

ESP32	MAX98357A
16	LRC
15	BCLK
7	DIN
GND	GND
3.3/5V	VIN

ESP32	INMP441
4	SCK
5	WS
6	SD
3.3V	VDD
GND	GND

INMP441与MAX98357A未提及的引脚不需要接线；MAX98357A还需接喇叭，注意正负极，红色为正，黑色为负

开发环境：Arduino IDE

开发板选取：ESP32S3 Dev Module

流程如下（默认已经有了ESP32环境）

如果说这个方法找不到开发板，还有一种方法，具体流程如下

实现代码：（代码有bug）

#include <WiFi.h>
#include <HTTPClient.h>
#include <ArduinoJson.h>
#include <driver/i2s.h>
#include <UrlEncode.h>
#include <base64.hpp>
#include <string.h>
#include "cJSON.h"

// I2S output (MAX98357A amplifier): port and GPIO numbers used by setup()
#define I2S_OUT_PORT I2S_NUM_1
#define I2S_OUT_BCLK 15
#define I2S_OUT_LRC 16
#define I2S_OUT_DOUT 7

// I2S input (INMP441 microphone): port and GPIO numbers used by setup()
#define I2S_IN_PORT I2S_NUM_0
#define I2S_IN_BCLK 4
#define I2S_IN_LRC 5
#define I2S_IN_DIN 6

// WiFi credentials (placeholders - replace with your own network)
const char* ssid = "name";
const char* password = "password";

// API key sent as "Bearer <apiKey>" in the Authorization header by getGPTAnswer()
const char* apiKey = "*******************";
// Chat-completion endpoint that getGPTAnswer() posts to
String apiUrl = "https://api.minimax.chat/v1/text/chatcompletion_v2";

const int STT_DEV_PID = 1537; // Optional. Recognition model: 1737 English, 1537 Mandarin (near-field model), 1936 Mandarin far-field, 1837 Sichuanese
const char *STT_CUID = "*****************8"; // Unique user identifier used to tell users apart and compute UV counts. A device MAC address or IMEI is suggested; at most 60 characters.
const char *STT_CLIENT_ID = "*****************8"; // Baidu API Key (passed to getAccessToken as client_id)
const char *STT_CLIENT_SECRET = "*******************"; // Baidu Secret Key (passed to getAccessToken as client_secret)

// Audio recording settings
#define SAMPLE_RATE 16000
#define RECORD_TIME_SECONDS 15
// Number of samples in RECORD_TIME_SECONDS at SAMPLE_RATE.
// NOTE(review): loop() passes this value to ps_malloc() as a BYTE count; with
// 16-bit samples that holds only half of RECORD_TIME_SECONDS - confirm intent.
#define BUFFER_SIZE (SAMPLE_RATE * RECORD_TIME_SECONDS)

/**
 * Audio buffers, pointers and selectors.
 *
 * State shared between capture_samples() and audio_inference_callback().
 */
typedef struct {
  int16_t* buffer;     // destination for captured samples; not allocated anywhere in the visible code
  uint8_t buf_ready;   // set to 1 by audio_inference_callback() each time `buffer` has been filled
  uint32_t buf_count;  // next write index into `buffer`; reset to 0 when it reaches n_samples
  uint32_t n_samples;  // number of samples after which `buffer` counts as full
} inference_t;

static inference_t inference;
// Capacity, in 16-bit samples, of the intermediate capture buffer below.
static const uint32_t sample_buffer_size = 2048;
// Raw samples read from I2S by capture_samples() before being copied into inference.buffer.
static signed short sampleBuffer[sample_buffer_size];
static bool debug_nn = false;  // Set this to true to see e.g. features generated from the raw signal
// While true, capture_samples() keeps reading from the I2S input.
static bool record_status = true;

/**
 * Request an OAuth access token from the Baidu token endpoint using the
 * client-credentials grant.
 *
 * @param api_key     Baidu API Key, sent as `client_id`.
 * @param secret_key  Baidu Secret Key, sent as `client_secret`.
 * @return The access token, or an empty String when the HTTP request fails or
 *         the response carries no `access_token` field. Callers test for ""
 *         to detect failure, so a missing field must not come back as "null".
 */
String getAccessToken(const char* api_key, const char* secret_key) {
  String access_token = "";
  HTTPClient http;

  // Build the HTTP request
  http.begin("https://aip.baidubce.com/oauth/2.0/token?grant_type=client_credentials&client_id=" + String(api_key) + "&client_secret=" + String(secret_key));
  int httpCode = http.POST("");

  if (httpCode == HTTP_CODE_OK) {
    String response = http.getString();

    // Only `access_token` is needed. Filtering keeps the other (potentially
    // long) fields of the response out of the 1 KB document.
    StaticJsonDocument<64> filter;
    filter["access_token"] = true;

    DynamicJsonDocument doc(1024);
    DeserializationError err = deserializeJson(doc, response, DeserializationOption::Filter(filter));
    if (err) {
      Serial.printf("[HTTP] access_token JSON parse failed: %s\n", err.c_str());
    }

    // `| ""` yields an empty string (not "null") when the field is absent.
    access_token = doc["access_token"] | "";

    // printf("%s") needs a C string; passing the String object itself is undefined behaviour.
    Serial.printf("[HTTP] GET access_token: %s\n", access_token.c_str());
  } else {
    Serial.printf("[HTTP] GET... failed, error: %s\n", http.errorToString(httpCode).c_str());
  }
  http.end();

  return access_token;
}

/**
 * Join the configured WiFi network in station mode and block until connected.
 *
 * Prints one '.' per second while waiting, then the assigned local IP address
 * and a prompt line.
 */
void wifi_setup() {
  WiFi.mode(WIFI_STA);
  WiFi.begin(ssid, password);

  Serial.print("Connecting to WiFi ..");
  for (;;) {
    if (WiFi.status() == WL_CONNECTED) {
      break;
    }
    Serial.print('.');
    delay(1000);
  }

  Serial.println(WiFi.localIP());
  Serial.println("Enter a prompt:");
}

/**
 * Synthesize `text` with the Baidu text2audio endpoint and stream the returned
 * audio to the I2S output through playAudio().
 *
 * The text is URL-encoded twice and sent as a query parameter of a GET
 * request. When the response Content-Type starts with "audio" the body is
 * played as it arrives; a JSON body is printed as a synthesis error.
 *
 * @param access_token  Baidu access token; the call is skipped when empty.
 * @param text          Text to speak; the call is skipped when empty.
 */
void baiduTTS_Send(String access_token, String text) {
  if (access_token == "") {
    Serial.println("access_token is null");
    return;
  }

  if (text.length() == 0) {
    Serial.println("text is null");
    return;
  }

  // Synthesis parameters, sent verbatim as query arguments of the same name.
  // NOTE(review): per the Baidu TTS REST docs these select voice, speed, pitch,
  // volume and audio format (aue=6 being WAV). If so, the WAV header bytes are
  // also written to I2S as samples - confirm against the docs.
  const int per = 1;
  const int spd = 6;
  const int pit = 5;
  const int vol = 15;
  const int aue = 6;

  // URL-encode the text (twice)
  String encodedText = urlEncode(urlEncode(text));

  // Assemble the request URL
  String url = "https://tsn.baidu.com/text2audio";

  const char* header[] = { "Content-Type", "Content-Length" };

  url += "?tok=" + access_token;
  url += "&tex=" + encodedText;
  url += "&per=" + String(per);
  url += "&spd=" + String(spd);
  url += "&pit=" + String(pit);
  url += "&vol=" + String(vol);
  url += "&aue=" + String(aue);
  url += "&cuid=esp32s3";
  url += "&lan=zh";
  url += "&ctp=1";

  // Create the HTTP request
  HTTPClient http;

  http.begin(url);
  http.collectHeaders(header, 2);

  int httpResponseCode = http.GET();
  if (httpResponseCode > 0) {
    if (httpResponseCode == HTTP_CODE_OK) {
      String contentType = http.header("Content-Type");
      if (contentType.startsWith("audio")) {
        Serial.println("合成成功");

        // 16 kHz / 16-bit playback consumes 32000 bytes per second. Reading
        // 32 bytes per pass with an unconditional 1 ms delay could never
        // outrun that, so read larger chunks and only sleep when idle.
        uint8_t buffer[512];
        const uint32_t kIdleTimeoutMs = 5000;  // give up if the stream stalls this long
        uint32_t lastDataMs = millis();

        int len = http.getSize();  // body length, or -1 when the response has no Content-Length
        // Stream carrying the returned audio data
        WiFiClient* stream = http.getStreamPtr();

        while (http.connected() && (len > 0 || len == -1)) {
          size_t size = stream->available();  // bytes ready to be read
          if (size) {
            int c = stream->readBytes(buffer, ((size > sizeof(buffer)) ? sizeof(buffer) : size));
            playAudio(buffer, c);
            if (len > 0) {
              len -= c;
            }
            lastDataMs = millis();
          } else {
            if (millis() - lastDataMs > kIdleTimeoutMs) {
              Serial.println("TTS stream stalled");
              break;
            }
            delay(1);
          }
        }

        // Let the queued samples play out, then clear the I2S DMA buffer
        delay(200);
        clearAudio();
      } else if (contentType.equals("application/json")) {
        Serial.println("合成出现错误");
        String response = http.getString();  // print the error JSON for debugging
        Serial.println(response);

      } else {
        Serial.println("未知的Content-Type: " + contentType);
      }
    } else {
      Serial.print("Error code: ");
      Serial.println(httpResponseCode);
      String response = http.getString();  // print the error response for debugging
      Serial.println(response);
    }
  } else {
    Serial.print("Error code: ");
    Serial.println(httpResponseCode);
  }
  http.end();
}

/**
 * Send `inputText` as the user message of a chat-completion request to
 * `apiUrl` and return the assistant's reply.
 *
 * The request body is built with ArduinoJson so that quotes, backslashes and
 * control characters in `inputText` are escaped; concatenating the raw text
 * into a JSON string literal produces an invalid request as soon as the text
 * contains a double quote.
 *
 * @param inputText  The user's question.
 * @return The reply text, or "<error>" when the HTTP status is not 200 or the
 *         response contains no `choices[0].message.content`.
 */
String getGPTAnswer(String inputText) {
  // Fixed room for the object/array slots plus a copy of the user text.
  DynamicJsonDocument requestDoc(512 + inputText.length() + 1);
  requestDoc["model"] = "abab5.5s-chat";
  JsonArray messages = requestDoc.createNestedArray("messages");

  JsonObject systemMessage = messages.createNestedObject();
  systemMessage["role"] = "system";
  systemMessage["content"] = "要求下面的回答严格控制在256字符以内";

  JsonObject userMessage = messages.createNestedObject();
  userMessage["role"] = "user";
  userMessage["content"] = inputText;

  String payload;
  serializeJson(requestDoc, payload);

  HTTPClient http;
  http.setTimeout(10000);
  http.begin(apiUrl);
  http.addHeader("Content-Type", "application/json");
  String token_key = String("Bearer ") + apiKey;
  http.addHeader("Authorization", token_key);

  int httpResponseCode = http.POST(payload);
  if (httpResponseCode != 200) {
    http.end();
    Serial.printf("Error %i \n", httpResponseCode);
    return "<error>";
  }

  String response = http.getString();
  http.end();
  Serial.println(response);

  // Parse the JSON response. 1 KB is too small once the reply approaches the
  // requested 256 characters of multi-byte text plus metadata.
  DynamicJsonDocument jsonDoc(4096);
  DeserializationError err = deserializeJson(jsonDoc, response);
  if (err) {
    Serial.printf("Response JSON parse failed: %s\n", err.c_str());
  }

  const char* content = jsonDoc["choices"][0]["message"]["content"] | (const char*)NULL;
  if (content == NULL) {
    // Same sentinel as the HTTP-failure path instead of the literal "null".
    return "<error>";
  }
  return String(content);
}

void setup() {
  // 设置串口波特率
  Serial.begin(115200);

  // 设置LED输出模式，并初始化设置为低
  pinMode(LED_BUILTIN, OUTPUT);
  digitalWrite(LED_BUILTIN, LOW);  //Turn off

  // Connect to WiFi
wifi_setup();

  // Initialize I2S for audio output
  i2s_config_t i2s_config_out = {
    .mode = (i2s_mode_t)(I2S_MODE_MASTER | I2S_MODE_TX),
    .sample_rate = SAMPLE_RATE,
    .bits_per_sample = I2S_BITS_PER_SAMPLE_16BIT,
    .channel_format = I2S_CHANNEL_FMT_ONLY_RIGHT,
    .communication_format = (i2s_comm_format_t)(I2S_COMM_FORMAT_STAND_I2S),
    .intr_alloc_flags = ESP_INTR_FLAG_LEVEL1,
    .dma_buf_count = 8,
    .dma_buf_len = 1024,
  };
  i2s_pin_config_t pin_config = {
    .bck_io_num = I2S_OUT_BCLK,
    .ws_io_num = I2S_OUT_LRC,
    .data_out_num = I2S_OUT_DOUT,
    .data_in_num = -1
  };
  i2s_driver_install(I2S_OUT_PORT, &i2s_config_out, 0, NULL);
  i2s_set_pin(I2S_OUT_PORT, &pin_config);

  // Initialize I2S for audio input
  i2s_config_t i2s_config_in = {
    .mode = (i2s_mode_t)(I2S_MODE_MASTER | I2S_MODE_RX),
    .sample_rate = SAMPLE_RATE,
    .bits_per_sample = I2S_BITS_PER_SAMPLE_16BIT,  // 注意：INMP441 输出 32 位数据
    .channel_format = I2S_CHANNEL_FMT_ONLY_LEFT,
    .communication_format = i2s_comm_format_t(I2S_COMM_FORMAT_STAND_I2S),
    .intr_alloc_flags = ESP_INTR_FLAG_LEVEL1,
    .dma_buf_count = 8,
    .dma_buf_len = 1024,
  };
  i2s_pin_config_t pin_config_in = {
    .bck_io_num = I2S_IN_BCLK,
    .ws_io_num = I2S_IN_LRC,
    .data_out_num = -1,
    .data_in_num = I2S_IN_DIN
  };
  i2s_driver_install(I2S_IN_PORT, &i2s_config_in, 0, NULL);
  i2s_set_pin(I2S_IN_PORT, &pin_config_in);


}

/**
 * @brief      Arduino main function. Runs the inferencing loop.
 */
void loop() {


  String baidu_access_token = "";

  baidu_access_token = getAccessToken(STT_CLIENT_ID, STT_CLIENT_SECRET);

    size_t bytes_read = 0, recordingSize = 0;
  int16_t data[512];

  // 分配内存
  uint8_t* decode_data = (uint8_t*)ps_malloc(BUFFER_SIZE);
  if (!decode_data) {
    Serial.println("Failed to allocate memory for decode_data");
    return;
  }

  uint8_t* pcm_data = (uint8_t*)ps_malloc(BUFFER_SIZE);
  if (!pcm_data) {
    Serial.println("Failed to allocate memory for pcm_data");
    return;
  }


  while (1) {
    // i2s录音
    esp_err_t result = i2s_read(I2S_NUM_0, data, sizeof(data), &bytes_read, portMAX_DELAY);

    // 打印采集的数据，用于调试
    // for (int i = 0; i < bytes_read / 2; i++) {
    //   Serial.println(data[i]);
    // }

    // 放大音量
    for (int i = 0; i < bytes_read / 2; i++) {
        data[i] *= 20;
    }

    memcpy(pcm_data + recordingSize, data, bytes_read);
    recordingSize += bytes_read;
    // Serial.printf("%x recordingSize: %d bytes_read :%d\n", pcm_data + recordingSize, recordingSize, bytes_read);

    if (recordingSize >= BUFFER_SIZE - bytes_read) {
      break;
    }
  }

      if (recordingSize > 0) {
        // 音频转文本（语音识别API访问）
        String recognizedText = baiduSTT_Send(baidu_access_token, pcm_data, recordingSize);
        Serial.println("Recognized text: " + recognizedText);

      
       String ernieResponse = getGPTAnswer(recognizedText);
    Serial.println("Enter a prompt:");
        Serial.println("Ernie Bot response: " + ernieResponse);

        // 文本转音频tts并通过MAX98357A输出（语音合成API访问）
        baiduTTS_Send(baidu_access_token, ernieResponse);
        Serial.println("ttsSize: ");
      
      }

      // 释放内存
      free(pcm_data);

      delay(10);
    }
 
/**
 * Copy freshly captured samples from sampleBuffer into inference.buffer.
 *
 * @param n_bytes  Number of bytes of valid data in sampleBuffer; half that
 *                 many 16-bit samples are copied.
 *
 * Whenever the write index reaches inference.n_samples it is reset to zero and
 * inference.buf_ready is set to 1.
 */
static void audio_inference_callback(uint32_t n_bytes) {
  const uint32_t sample_count = n_bytes >> 1;

  for (uint32_t idx = 0; idx < sample_count; ++idx) {
    inference.buffer[inference.buf_count] = sampleBuffer[idx];
    inference.buf_count += 1;

    if (inference.buf_count < inference.n_samples) {
      continue;
    }

    // Buffer full: restart at the beginning and flag it as ready.
    inference.buf_count = 0;
    inference.buf_ready = 1;
  }
}

static void capture_samples(void* arg) {

  const int32_t i2s_bytes_to_read = (uint32_t)arg;
  size_t bytes_read = i2s_bytes_to_read;

  while (1) {
    if (record_status) {
      /* read data at once from i2s - Modified for XIAO ESP2S3 Sense and I2S.h library */
      i2s_read(I2S_IN_PORT, (void*)sampleBuffer, i2s_bytes_to_read, &bytes_read, 100);
      // esp_i2s::i2s_read(esp_i2s::I2S_NUM_0, (void *)sampleBuffer, i2s_bytes_to_read, &bytes_read, 100);

      if (bytes_read <= 0) {
        Serial.println("Error in I2S read : %d");
      } else {
        if (bytes_read < i2s_bytes_to_read) {
          Serial.println("Partial I2S read");
        }

        // scale the data (otherwise the sound is too quiet)
        for (int x = 0; x < i2s_bytes_to_read / 2; x++) {
          sampleBuffer[x] = (int16_t)(sampleBuffer[x]) * 8;
        }

        audio_inference_callback(i2s_bytes_to_read);
      }
    }
    delay(1);
  }
  vTaskDelete(NULL);
}

/**
 * Send recorded PCM audio to the Baidu speech-recognition endpoint and return
 * the recognized text.
 *
 * The audio is Base64-encoded directly into the JSON request body, which is
 * held in a single PSRAM allocation sized exactly for header + Base64 + tail.
 *
 * @param access_token   Baidu access token; the call is skipped when empty.
 * @param audioData      Raw PCM bytes to recognize.
 * @param audioDataSize  Number of bytes in `audioData`.
 * @return The first entry of the response's `result` array, or an empty
 *         String on any failure.
 *
 * Fixes over the previous version:
 *  - the JSON buffer was sized as 1.4x the audio, which has no room for the
 *    JSON fields and token when the clip is short (buffer overflow);
 *  - the Base64 buffer leaked when the second allocation failed;
 *  - `result` is an array, so converting it to String returned `["text"]`
 *    rather than the text itself;
 *  - the configured STT_DEV_PID / STT_CUID / SAMPLE_RATE are now sent instead
 *    of hard-coded duplicates;
 *  - the body is posted from the buffer without an extra String copy.
 */
String baiduSTT_Send(String access_token, uint8_t* audioData, int audioDataSize) {
  String recognizedText = "";

  if (access_token == "") {
    Serial.println("access_token is null");
    return recognizedText;
  }

  if (audioData == NULL || audioDataSize <= 0) {
    Serial.println("audio data is empty");
    return recognizedText;
  }

  // Everything that precedes the Base64 payload.
  String jsonHead = "{\"format\":\"pcm\",\"rate\":";
  jsonHead += String(SAMPLE_RATE);
  jsonHead += ",\"dev_pid\":";
  jsonHead += String(STT_DEV_PID);
  jsonHead += ",\"channel\":1,\"cuid\":\"";
  jsonHead += STT_CUID;
  jsonHead += "\",\"token\":\"";
  jsonHead += access_token;
  jsonHead += "\",\"len\":";
  jsonHead += String(audioDataSize);
  jsonHead += ",\"speech\":\"";

  static const char jsonTail[] = "\"}";

  const size_t headLen = jsonHead.length();
  const size_t base64Len = encode_base64_length((unsigned int)audioDataSize);
  // sizeof(jsonTail) includes its terminating NUL, which also covers the NUL
  // that encode_base64() writes after its output.
  const size_t capacity = headLen + base64Len + sizeof(jsonTail);

  char* data_json = (char*)ps_malloc(capacity);
  if (!data_json) {
    Serial.println("Failed to allocate memory for data_json");
    return recognizedText;
  }

  memcpy(data_json, jsonHead.c_str(), headLen);
  // Base64-encode the audio straight into the request body
  const size_t encodedLen = encode_base64(audioData, (unsigned int)audioDataSize,
                                          (unsigned char*)data_json + headLen);
  memcpy(data_json + headLen + encodedLen, jsonTail, sizeof(jsonTail));
  const size_t bodyLen = headLen + encodedLen + sizeof(jsonTail) - 1;

  // Create the HTTP request
  HTTPClient http_client;

  http_client.begin("http://vop.baidu.com/server_api");
  http_client.addHeader("Content-Type", "application/json");
  int httpCode = http_client.POST((uint8_t*)data_json, bodyLen);

  // The request body is no longer needed once it has been sent.
  free(data_json);
  data_json = NULL;

  if (httpCode > 0) {
    if (httpCode == HTTP_CODE_OK) {
      // Fetch the response
      String response = http_client.getString();
      Serial.println(response);

      // Extract the recognized text from the JSON response
      DynamicJsonDocument responseDoc(2048);
      DeserializationError err = deserializeJson(responseDoc, response);
      if (err) {
        Serial.printf("[HTTP] STT JSON parse failed: %s\n", err.c_str());
      }
      recognizedText = responseDoc["result"][0] | "";
    } else {
      Serial.printf("[HTTP] POST returned status %d\n", httpCode);
    }
  } else {
    Serial.printf("[HTTP] POST failed, error: %s\n", http_client.errorToString(httpCode).c_str());
  }

  http_client.end();
  Serial.println(recognizedText);
  return recognizedText;
}

/**
 * Write a chunk of audio bytes to the I2S output port (MAX98357A).
 *
 * @param audioData      Bytes to write.
 * @param audioDataSize  Number of bytes; nothing is written when this is 0.
 *
 * The write uses portMAX_DELAY as its timeout.
 */
void playAudio(uint8_t* audioData, size_t audioDataSize) {
  if (audioDataSize == 0) {
    return;
  }

  size_t written = 0;
  i2s_write(I2S_OUT_PORT, (int16_t*)audioData, audioDataSize, &written, portMAX_DELAY);
}

/**
 * Zero the DMA buffer of the I2S output port and print "clearAudio" on the
 * serial port.
 */
void clearAudio(void) {
  // Clear the I2S DMA buffer
  i2s_zero_dma_buffer(I2S_OUT_PORT);
  Serial.print("clearAudio");
}