目标:实现完整的语音交互功能,包含录音、语音识别(STT)、AI对话回复、 语音合成(TTS)和音频播放的全流程
分为两个版本:
- 基础版本(非流式)
文件名:voice_interaction.py
功能:录音 → 语音识别 → AI对话 → 语音合成 → 播放,全部在内存中完成,不写入磁盘。
- 流式版本
文件名:streaming_voice_interaction.py
功能:支持声音活动检测(VAD)、流式录音、流式识别、流式AI回复、流式TTS播放,适合实时交互场景。
| 功能模块 | 非流式 (voice_interaction.py) | 流式 (streaming_voice_interaction.py) |
|---|---|---|
| 录音 | 固定5秒,不管有没有说完 | 实时检测声音活动(VAD),静音3秒自动停 |
| 识别 | 录完整个音频后一次性识别 | 录完后一次性识别(目前未用真流式STT) |
| AI对话 | 等AI完整回复再显示 | 流式输出,逐字打印,像ChatGPT打字 |
| TTS合成 | 等整段文本合成完再播 | 边接收音频流边播,几乎无延迟 |
| 播放 | 播放完整音频 | 实时播放流式音频 |
| 线程控制 | 无 | 使用 threading 和 queue 实现异步录音 |
| 用户体验 | 机械、卡顿、等待时间长 | 自然、流畅、接近真人对话 |
!sudo apt-get install -y portaudio19-dev
!pip install PyAudio
非流式:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
语音交互核心模块
功能:实现录音、语音识别(STT)、AI对话、语音合成(TTS)和音频播放的完整流程
优化:不存储任何音频文件,所有音频在内存中处理
"""
import pyaudio
import wave
import requests
import json
import re
from io import BytesIO
# ==================== 1. 配置管理 ====================
class Config:
    """Central configuration for the voice-interaction pipeline."""
    # SiliconFlow credentials and endpoint (placeholder key — replace with a real one).
    SILICONFLOW_API_KEY = "sk-xxxxxxxxxxxxx"
    API_BASE_URL = "https://api.siliconflow.cn/v1"
    # Model identifiers for speech-to-text, text-to-speech and chat.
    STT_MODEL = "FunAudioLLM/SenseVoiceSmall"
    TTS_MODEL = "FunAudioLLM/CosyVoice2-0.5B"
    CHAT_MODEL = "THUDM/glm-4-9b-chat"
    # Recording parameters: 16 kHz mono, fixed-length capture.
    SAMPLE_RATE = 16000  # Hz
    CHANNELS = 1         # mono
    DURATION = 5         # seconds per recording
# ==================== 文本处理工具 ====================
def clean_text_for_tts(text):
    """Strip Markdown markup and unsupported symbols before TTS synthesis.

    Keeps word characters, whitespace and common CJK/ASCII sentence
    punctuation; converts semicolons/colons to commas so the TTS engine
    still pauses on them; collapses runs of whitespace.

    Args:
        text: raw model output (may contain Markdown formatting).
    Returns:
        A cleaned string safe to feed to the TTS endpoint (possibly empty).
    """
    # One pass deletes '*' (which also covers '**' bold markers) and
    # backticks — the original ran three separate substitutions for the
    # same literal-character deletions.
    text = re.sub(r'[*`]', '', text)
    text = re.sub(r'[\[\]\{\}]', '', text)      # drop brackets/braces
    text = re.sub(r'[;:]', ',', text)           # map pauses to commas
    # Whitelist: word chars, whitespace, CJK and ASCII sentence punctuation.
    text = re.sub(r'[^\w\s,。,.?!?!()]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()    # normalize whitespace
    return text
# ==================== 2. 录音功能 ====================
def record_audio():
    """Record a fixed-length clip from the default microphone.

    Captures Config.DURATION seconds of 16-bit PCM at Config.SAMPLE_RATE /
    Config.CHANNELS and returns it as an in-memory WAV file (a BytesIO
    rewound to position 0). Nothing is written to disk.
    """
    chunk = 1024
    p = pyaudio.PyAudio()
    # Query the sample width while the PyAudio instance is alive — the
    # original queried it after terminate(), which happens to work but
    # relies on an implementation detail.
    sample_width = p.get_sample_size(pyaudio.paInt16)
    frames = []
    try:
        stream = p.open(format=pyaudio.paInt16,
                        channels=Config.CHANNELS,
                        rate=Config.SAMPLE_RATE,
                        input=True,
                        frames_per_buffer=chunk)
        print(f"正在录音...({Config.DURATION}秒后自动停止)")
        try:
            for _ in range(int(Config.SAMPLE_RATE / chunk * Config.DURATION)):
                # exception_on_overflow=False: tolerate input-buffer overruns
                # instead of aborting the whole recording with an IOError.
                frames.append(stream.read(chunk, exception_on_overflow=False))
            print("录音结束")
        finally:
            # Release the stream even if reading raised.
            stream.stop_stream()
            stream.close()
    finally:
        p.terminate()
    # Wrap the raw PCM frames in a WAV container, entirely in memory.
    wav_buffer = BytesIO()
    with wave.open(wav_buffer, 'wb') as wf:
        wf.setnchannels(Config.CHANNELS)
        wf.setsampwidth(sample_width)
        wf.setframerate(Config.SAMPLE_RATE)
        wf.writeframes(b''.join(frames))
    wav_buffer.seek(0)
    return wav_buffer
# ==================== 3. 语音识别(STT) ====================
def speech_to_text(audio_buffer):
    """Transcribe an in-memory WAV buffer via the SiliconFlow STT endpoint.

    Args:
        audio_buffer: file-like object positioned at the start of WAV data.
    Returns:
        The recognized text, the fallback string "未识别到内容" when the
        response carries no text, or a Chinese error message on failure —
        the caller never sees an exception.
    """
    url = f"{Config.API_BASE_URL}/audio/transcriptions"
    headers = {"Authorization": f"Bearer {Config.SILICONFLOW_API_KEY}"}
    # multipart/form-data: non-file fields are sent as (None, value) tuples.
    files = {
        'file': ('recording.wav', audio_buffer, 'audio/wav'),
        'model': (None, Config.STT_MODEL),
        'language': (None, 'zh-CN'),
        'response_format': (None, 'json')
    }
    try:
        # timeout keeps the interaction loop from hanging forever on a
        # stalled connection (the original request had no timeout).
        response = requests.post(url, headers=headers, files=files, timeout=60)
        response.raise_for_status()
        return response.json().get('text', "未识别到内容")
    except Exception as e:
        return f"语音识别失败: {str(e)}"
# ==================== 4. 语音合成(TTS) ====================
def text_to_speech(text):
    """Synthesize speech for *text* via the SiliconFlow TTS endpoint.

    The text is first cleaned with clean_text_for_tts(). Returns a BytesIO
    holding the WAV audio (rewound to position 0), or None when there is
    nothing to synthesize or the request fails.
    """
    cleaned_text = clean_text_for_tts(text)
    if not cleaned_text:
        print("清理后无有效文本,无法合成语音")
        return None
    url = f"{Config.API_BASE_URL}/audio/speech"
    headers = {
        "Authorization": f"Bearer {Config.SILICONFLOW_API_KEY}",
        "Content-Type": "application/json"
    }
    data = {
        "model": Config.TTS_MODEL,
        "input": cleaned_text,
        "gain": 0,
        "speed": 1,
        "stream": False,  # non-streaming variant: whole clip in one response
        "voice": "FunAudioLLM/CosyVoice2-0.5B:anna",
        "response_format": "wav"
    }
    try:
        # timeout prevents an indefinite hang on a stalled connection
        # (the original request had no timeout).
        response = requests.post(url, headers=headers, json=data, timeout=60)
        response.raise_for_status()
        # BytesIO(content) starts at position 0 — no explicit seek needed.
        return BytesIO(response.content)
    except Exception as e:
        print(f"语音合成失败: {str(e)}")
        return None
# ==================== 5. 音频播放 ====================
def play_audio(audio_buffer):
if not audio_buffer:
print("无有效音频数据,无法播放")
return
try:
wf = wave.open(audio_buffer, 'rb')
p = pyaudio.PyAudio()
stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
channels=wf.getnchannels(),
rate=wf.getframerate(),
流式语音交互技术解析

最低0.47元/天 解锁文章
1414

被折叠的 条评论
为什么被折叠?



