"""
【语音识别模块】Speech Recognition (Offline)
使用麦克风进行实时语音识别,基于 Vosk 离线模型
支持单次识别 & 持续监听模式
"""
import threading
import time
import logging
import json
import numpy as np
from database import config
from Progress.utils.logger_utils import log_time, log_step, log_var, log_call
from Progress.utils.logger_config import setup_logger
from vosk import Model, KaldiRecognizer
import pyaudio
# --- 配置参数 ---
VOICE_TIMEOUT = config.timeout # 最大等待语音输入时间(秒)
VOICE_PHRASE_TIMEOUT = config.phrase_timeout # 单句话最长录音时间
VOICE_RECOGNITION_LANGUAGE = config.lang # 如 'zh-CN', 'en-US'
VOSK_MODEL_PATH = "./vosk-model-small-cn-0.22"
# --- 初始化日志器 ---
logger = logging.getLogger("ai_assistant")
class SpeechRecognizer:
    """Offline speech recognition built on a local Vosk model and PyAudio.

    Two usage modes:
      * one-shot:  ``text = recognizer.listen_and_recognize()``
      * streaming: ``recognizer.start_listening(callback)`` / ``stop_listening()``

    Only the Vosk model, the PyAudio handle and the KaldiRecognizer live for
    the object's lifetime; microphone streams are opened per operation.
    """

    def __init__(self):
        self.model = None             # vosk.Model, set by _load_model()
        self.recognizer = None        # vosk.KaldiRecognizer, set by _calibrate_noise()
        self.audio = None             # pyaudio.PyAudio handle
        self.is_listening = False     # controls ONLY the background listen loop
        self.callback = None          # user-registered callback: callback(text)
        self._last_text = ""          # most recent successful recognition
        self._listen_thread = None    # background listener thread, if started
        self.sample_rate = 16000      # Vosk expects 16 kHz mono PCM
        self.chunk_size = 8000        # frames per read; tune for latency vs. CPU
        self._load_model()
        self._calibrate_noise()

    @log_step("加载 Vosk 离线模型")
    @log_time
    def _load_model(self):
        """Load the local Vosk model from VOSK_MODEL_PATH.

        Raises:
            RuntimeError: if the model directory is missing or unreadable.
        """
        try:
            logger.info(f"📦 正在加载模型: {VOSK_MODEL_PATH}")
            self.model = Model(VOSK_MODEL_PATH)
            log_call("✅ 模型加载成功")
        except Exception as e:
            logger.critical(f"🔴 加载 Vosk 模型失败,请确认路径正确并下载模型: {e}")
            raise RuntimeError("Failed to load Vosk model") from e

    @log_step("校准环境噪音(初始化音频流)")
    @log_time
    def _calibrate_noise(self):
        """Initialize PyAudio and create the recognizer.

        NOTE(review): despite the name, no noise calibration happens here —
        a probe stream is opened/closed to verify audio works, then the
        KaldiRecognizer is created.
        FIX: the probe stream is closed in ``finally`` so it cannot leak
        when KaldiRecognizer construction raises.
        """
        try:
            self.audio = pyaudio.PyAudio()
            stream = self.audio.open(
                format=pyaudio.paInt16,
                channels=1,
                rate=self.sample_rate,
                input=True,
                frames_per_buffer=self.chunk_size,
            )
            try:
                self.recognizer = KaldiRecognizer(self.model, self.sample_rate)
            finally:
                stream.close()  # probe stream no longer needed
            logger.debug("✅ 音频系统初始化完成")
        except Exception:
            logger.exception("❌ 初始化音频失败")
            raise

    @property
    def last_text(self) -> str:
        """Text of the most recent successful recognition ("" if none)."""
        return self._last_text

    def is_available(self) -> bool:
        """Return True if the default input device can be opened.

        FIX: guards against ``self.audio`` being None so a failed/partial
        init yields False instead of raising AttributeError.
        """
        if self.audio is None:
            return False
        try:
            probe = self.audio.open(
                format=pyaudio.paInt16,
                channels=1,
                rate=self.sample_rate,
                input=True,
                frames_per_buffer=self.chunk_size,
            )
            probe.close()
            return True
        except Exception as e:
            logger.error(f"🔴 麦克风不可用: {e}")
            return False

    @log_step("执行单次语音识别")
    @log_time
    def listen_and_recognize(self, timeout=None) -> str:
        """Record one utterance from the microphone and return its text.

        :param timeout: max seconds to wait for speech; defaults to VOICE_TIMEOUT.
        :return: recognized text, or "" on timeout/error.

        BUG FIX: the read loop was gated on ``self.is_listening``, which is
        only set by start_listening(); a standalone call (as main() makes)
        therefore read no audio and always returned "". The one-shot path now
        runs on its own deadline. Also: if the deadline expires mid-utterance
        the buffered audio is flushed via FinalResult() instead of being
        dropped, and the stream is always closed via ``finally``.
        """
        timeout = timeout or VOICE_TIMEOUT
        deadline = time.time() + timeout
        in_speech = False
        result_text = ""
        self.recognizer.Reset()
        logger.debug(f"🎙️ 开始单次语音识别 (timeout={timeout})...")
        logger.info("🔊 请说话...")
        try:
            stream = self.audio.open(
                format=pyaudio.paInt16,
                channels=1,
                rate=self.sample_rate,
                input=True,
                frames_per_buffer=1600,  # smaller chunk -> faster response
            )
        except Exception as e:
            logger.error(f"🔴 无法打开音频流: {e}")
            return ""
        try:
            while time.time() < deadline:
                try:
                    data = stream.read(1600, exception_on_overflow=False)
                except Exception:
                    logger.exception("读取音频出错")
                    break
                # Volume diagnostics (debug only).
                audio_np = np.frombuffer(data, dtype=np.int16)
                volume = np.abs(audio_np).mean()
                if volume < 30:
                    logger.debug(f"🔇 音量过低: {volume:.1f}")
                if self.recognizer.AcceptWaveform(data):
                    final_result = json.loads(self.recognizer.Result())
                    text = final_result.get("text", "").strip()
                    if text:
                        result_text = text
                        break
                else:
                    partial = json.loads(self.recognizer.PartialResult())
                    if partial.get("partial", "").strip():
                        in_speech = True  # speech has started; keep waiting
            if not result_text and in_speech:
                # Deadline hit while the user was still talking: flush what
                # Vosk has buffered so the utterance is not silently lost.
                flushed = json.loads(self.recognizer.FinalResult())
                result_text = flushed.get("text", "").strip()
            if not result_text and not in_speech:
                logger.info("💤 超时未检测到语音")
        finally:
            stream.stop_stream()
            stream.close()
        if result_text:
            self._last_text = result_text
            logger.info(f"🎯 识别结果: '{result_text}'")
            return result_text
        logger.info("❓ 未识别到有效内容")
        self._last_text = ""
        return ""

    @log_step("启动持续语音监听")
    def start_listening(self, callback=None, language=None):
        """Start a daemon thread that listens and recognizes continuously.

        :param callback: called as callback(text) for each recognized phrase.
        :param language: ignored — the loaded model determines the language.
        """
        if self.is_listening:
            logger.warning("⚠️ 已在监听中,忽略重复启动")
            return
        if callback:
            self.callback = callback
        self.is_listening = True
        self._listen_thread = threading.Thread(
            target=self._background_listen, args=(language,), daemon=True
        )
        self._listen_thread.start()
        logger.info("🟢 已启动后台语音监听")

    @log_step("停止语音监听")
    def stop_listening(self):
        """Signal the background thread to exit and join it (3 s grace)."""
        if not self.is_listening:
            return
        self.is_listening = False
        logger.info("🛑 正在停止语音监听...")
        if self._listen_thread and self._listen_thread is not threading.current_thread():
            self._listen_thread.join(timeout=3)
            if self._listen_thread.is_alive():
                logger.warning("🟡 监听线程未能及时退出(可能阻塞)")
        elif self._listen_thread is threading.current_thread():
            # Joining oneself would deadlock — e.g. stop called from callback.
            logger.error("❌ 无法在当前线程中 join 自己!请检查调用栈")
        else:
            logger.debug("No thread to join")
        logger.info("✅ 语音监听已停止")

    @log_time
    def _background_listen(self, language=None):
        """Loop: read audio, feed Vosk, invoke callback until stopped."""
        logger.debug("🎧 后台监听线程已启动")
        try:
            stream = self.audio.open(
                format=pyaudio.paInt16,
                channels=1,
                rate=self.sample_rate,
                input=True,
                frames_per_buffer=self.chunk_size,
            )
            self.recognizer.Reset()
        except Exception as e:
            logger.error(f"🔴 无法打开音频流: {e}")
            return
        try:
            while self.is_listening:
                try:
                    data = stream.read(self.chunk_size, exception_on_overflow=False)
                    if self.recognizer.AcceptWaveform(data):
                        result_dict = json.loads(self.recognizer.Result())
                        text = result_dict.get("text", "").strip()
                        if text and self.callback:
                            logger.info(f"🔔 回调触发: '{text}'")
                            self.callback(text)
                except Exception:
                    logger.exception("Background listening error")
                    time.sleep(0.1)  # avoid a tight error loop
        finally:
            # FIX: stream is released even if the loop exits via an error.
            stream.stop_stream()
            stream.close()
        logger.debug("🔚 后台监听线程退出")
def on_recognized(text):
    """Example callback: echo recognized text; stop listening on exit words."""
    print(f"\n🔔 回调收到: '{text}'")
    # NOTE(review): relies on a module-level `recognizer` that is not defined
    # in this file — confirm the caller provides it before using this demo.
    stop_words = ("退出", "停止")
    if any(word in text for word in stop_words):
        recognizer.stop_listening()
# --- (second concatenated file begins below: assistant main program) ---
"""
【AI语音助手】主程序入口
集成语音识别、Qwen 意图理解、TTS 与动作执行
✅ 已修复:不再访问 _last_text 私有字段
✅ 增强:异常防护、类型提示、唤醒词预留接口
"""
import sys
import time
import logging
# --- 导入日志工具 ---
from Progress.utils.logger_config import setup_logger
from Progress.utils.logger_utils import log_time, log_step, log_var, log_call
# --- 显式导入各模块核心类 ---
from Progress.app.voice_recognizer import SpeechRecognizer
from Progress.app.qwen_assistant import QWENAssistant
from Progress.app.text_to_speech import TTSEngine
from Progress.app.system_controller import SystemController, TaskOrchestrator
from database import config
# --- 初始化全局日志器 ---
logger = logging.getLogger("ai_assistant")
@log_step("初始化语音识别模块")
@log_time
def initialize_speech_recognizer() -> SpeechRecognizer:
    """Build the speech recognizer and verify the microphone is usable."""
    try:
        speech = SpeechRecognizer()
        if not speech.is_available():
            raise RuntimeError("麦克风不可用,请检查设备连接和权限")
    except Exception as e:
        logger.critical(f"🔴 初始化语音识别失败: {e}")
        raise
    log_call("✅ 语音识别器初始化完成")
    return speech
@log_step("初始化 AI 助手模块")
@log_time
def initialize_qwen_assistant() -> QWENAssistant:
    """Construct the Qwen intent-understanding assistant."""
    try:
        qwen = QWENAssistant()
    except Exception as e:
        logger.critical(f"🔴 初始化 Qwen 助手失败: {e}")
        raise
    log_call("✅ Qwen 助手初始化完成")
    return qwen
@log_step("初始化文本转语音模块")
@log_time
def initialize_tts_engine() -> TTSEngine:
    """Construct the TTS engine and verify it reports itself available."""
    try:
        engine = TTSEngine()
        if not engine.is_available():
            raise RuntimeError("TTS引擎初始化失败")
    except Exception as e:
        logger.critical(f"🔴 初始化 TTS 失败: {e}")
        raise
    log_call("✅ TTS 引擎初始化完成")
    return engine
@log_step("初始化动作执行器")
@log_time
def initialize_action_executor() -> TaskOrchestrator:
    """Wire a TaskOrchestrator to a fresh SystemController."""
    orchestrator = TaskOrchestrator(system_controller=SystemController())
    log_call("✅ 动作执行器初始化完成")
    return orchestrator
@log_step("安全执行单次交互")
@log_time
def handle_single_interaction_safe(
    recognizer: SpeechRecognizer,
    assistant: QWENAssistant,
    tts_engine: TTSEngine,
    executor: TaskOrchestrator
):
    """Run one interaction; on any failure, log it and speak a fallback."""
    try:
        handle_single_interaction(recognizer, assistant, tts_engine, executor)
    except Exception:
        logger.exception("⚠️ 单次交互过程中发生异常,已降级处理")
        fallback = "抱歉,我在处理刚才的操作时遇到了一点问题。"
        logger.info(f"🗣️ 回复: {fallback}")
        tts_engine.speak(fallback)
@log_step("处理一次语音交互")
@log_time
def handle_single_interaction(
    recognizer: SpeechRecognizer,
    assistant: QWENAssistant,
    tts_engine: TTSEngine,
    executor: TaskOrchestrator
):
    """One full listen → understand → act → speak cycle."""
    # 1. Listen — bail out early with a spoken prompt if nothing was heard.
    voice_text = recognizer.listen_and_recognize()
    if not voice_text:
        prompt = "抱歉,我没有听清楚,请重新说话。"
        logger.info(f"🗣️ 回复: {prompt}")
        tts_engine.speak(prompt)
        return
    log_var("🎤 识别到的语音文本", voice_text)

    # 2. Understand — ask the assistant for intent/action/reply.
    ai_response = assistant.process_voice_command(voice_text)
    ai_reply = ai_response.get("response", "好的,已处理。")
    log_var("🧠 AI响应.intent", ai_response.get("intent"))
    log_var("🧠 AI响应.action", ai_response.get("action"))
    log_var("🧠 AI响应.parameters", ai_response.get("parameters"))

    # 3. Act — only when the intent does not require explicit confirmation.
    if not ai_response.get("needs_confirmation", False):
        try:
            if not executor.execute(ai_response):
                ai_reply = "执行该操作时遇到了一些问题。"
        except Exception:
            logger.exception("💥 执行动作时发生异常")
            ai_reply = "抱歉,我在尝试执行这个操作时出了点问题。"

    # 4. Speak the final reply.
    logger.info(f"🗣️ 回复: {ai_reply}")
    tts_engine.speak(ai_reply)
@log_step("启动 AI 语音助手")
@log_time
def main():
    """Program entry: initialize every module, then run the interaction loop
    until an exit keyword is heard or the user presses Ctrl+C."""
    logger.info("🚀 正在启动 AI 语音助手系统...")
    try:
        recognizer = initialize_speech_recognizer()
        assistant = initialize_qwen_assistant()
        tts_engine = initialize_tts_engine()
        executor = initialize_action_executor()
        log_call("✅ 所有模块初始化完成,进入监听循环")

        banner = "—" * 50
        print("\n" + banner)
        print("🎙️ 语音助手已就绪")
        print("💡 说出你的命令,例如:'打开浏览器'、'写一篇春天的文章'")
        print("🛑 说出‘退出’、‘关闭’、‘停止’或‘拜拜’来结束程序")
        print(banner + "\n")

        exit_keywords = ('退出', '关闭', '停止', '拜拜', '再见')
        while True:
            handle_single_interaction_safe(recognizer, assistant, tts_engine, executor)
            heard = recognizer.last_text.lower()
            if any(word in heard for word in exit_keywords):
                logger.info("🎯 用户请求退出,程序即将终止")
                break
            time.sleep(0.5)  # brief pause between interactions
        logger.info("👋 语音助手已安全退出")
    except KeyboardInterrupt:
        logger.info("🛑 用户通过 Ctrl+C 中断程序")
        print("\n👋 再见!")
    except Exception as e:
        logger.exception("❌ 主程序运行时发生未预期异常")
        print(f"\n🚨 程序异常终止:{e}")
        sys.exit(1)
if __name__ == "__main__":
    # Configure the shared logger only when nothing has installed handlers yet.
    root = logging.getLogger()
    if not root.handlers:
        setup_logger(name="ai_assistant", log_dir="logs", level=logging.INFO)
    main()
""" # Using the config helpers (recommended, reference only)
# Add or update a shortcut setting
config.update_key("shortcuts", key="exit", value="Ctrl+C")
config.update_key("shortcuts", key="select_all", value="Shift+Alt+A")
# Change the resource path
config.set_resource_path("./resources") """
# (stray paste artifact neutralized: "最新发布")