import os
import ctypes
import tkinter as tk
from tkinter import ttk, messagebox
import sys
import requests
import json
import re
import faiss
import numpy as np
import sounddevice as sd
import queue
import threading
import time
from vosk import Model, KaldiRecognizer
os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
from sentence_transformers import SentenceTransformer
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import (
PyPDFLoader,
Docx2txtLoader,
TextLoader,
UnstructuredExcelLoader
)
# Set the address of the Server.
server_url = 'http://127.0.0.1:8080/rkllm_chat'
# Create a session object.
session = requests.Session()
session.keep_alive = False  # Do not reuse connections; open a fresh one for each request.
adapter = requests.adapters.HTTPAdapter(max_retries=5)
session.mount('https://', adapter)
session.mount('http://', adapter)
# Set the dynamic library path
xtts_lib = ctypes.CDLL('./aikit_xtts.so')
# Define the structures from the library
xtts_lib.ai_xtts.restype = ctypes.c_int  # return type: int
xtts_lib.ai_xtts.argtypes = [ctypes.c_char_p]  # argument type: char* (a C-style string)
class FileBasedVectorDB:
def __init__(self, embedding_model="sentence-transformers/all-MiniLM-L6-v2"):
        # Initialize the text embedding model
        self.embedder = SentenceTransformer(embedding_model)
        self.dimension = self.embedder.get_sentence_embedding_dimension()
        # The FAISS index is created later by build_index() or load_index()
        self.index = None
        self.documents = []  # raw text chunks
        self.metadata = []   # per-chunk metadata
def import_from_folder(self, folder_path: str, file_extensions=None):
"""从文件夹导入所有指定类型的文件"""
if file_extensions is None:
file_extensions = ['.pdf', '.docx', '.doc', '.txt', '.xlsx', '.xls']
print(f"开始扫描文件夹: {folder_path}")
file_paths = []
for root, _, files in os.walk(folder_path):
for file in files:
if os.path.splitext(file)[1].lower() in file_extensions:
file_paths.append(os.path.join(root, file))
print(f"找到 {len(file_paths)} 个符合要求的文件")
return self.process_files(file_paths)
def import_from_filelist(self, file_list_path: str):
"""从文件列表导入文档"""
if not os.path.exists(file_list_path):
raise FileNotFoundError(f"文件列表不存在: {file_list_path}")
with open(file_list_path, 'r') as f:
file_paths = [line.strip() for line in f if line.strip()]
print(f"从文件列表导入 {len(file_paths)} 个文件")
return self.process_files(file_paths)
def process_files(self, file_paths: list):
"""处理文件列表并提取文本"""
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=1000,
chunk_overlap=200
)
for path in file_paths:
if not os.path.exists(path):
print(f"⚠️ 文件不存在: {path}")
continue
file_ext = os.path.splitext(path)[1].lower()
print(f"处理文件: {path} ({file_ext})")
try:
                # Pick a document loader based on the file extension
if file_ext == '.pdf':
loader = PyPDFLoader(path)
elif file_ext in ['.doc', '.docx']:
loader = Docx2txtLoader(path)
elif file_ext == '.txt':
loader = TextLoader(path)
elif file_ext in ['.xlsx', '.xls']:
loader = UnstructuredExcelLoader(path)
else:
print(f"⚠️ 不支持的文件格式: {file_ext}")
continue
                # Load and split the document
                raw_docs = loader.load()
                chunks = text_splitter.split_documents(raw_docs)
                # Store each chunk together with its metadata
for idx, chunk in enumerate(chunks):
self.documents.append(chunk.page_content)
self.metadata.append({
"source": os.path.basename(path),
"path": path,
"chunk_id": idx,
"type": file_ext[1:]
})
print(f" ├─提取 {len(chunks)} 个文本片段")
except Exception as e:
print(f" ⚠️ 处理错误: {str(e)}")
print(f"总共提取 {len(self.documents)} 个文本片段")
return len(self.documents)
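    # Rough sanity check for the chunking above (illustrative, not used elsewhere):
    # with chunk_size=1000 and chunk_overlap=200, consecutive chunks share about 200
    # characters, so the splitter advances roughly 800 characters per chunk; exact
    # boundaries depend on the separators RecursiveCharacterTextSplitter prefers
    # (paragraph breaks, newlines, spaces).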
def build_index(self, index_type="flat"):
"""构建向量索引"""
if not self.documents:
print("❌ 没有可处理的文档")
return None
        # Generate embeddings for all document chunks
        print("生成文本嵌入向量...")
        embeddings = self.embedder.encode(self.documents, show_progress_bar=True)
        # Create the index
print(f"创建 {index_type.upper()} 索引...")
if index_type == "flat":
self.index = faiss.IndexFlatL2(self.dimension)
elif index_type == "ivfflat":
quantizer = faiss.IndexFlatL2(self.dimension)
self.index = faiss.IndexIVFFlat(quantizer, self.dimension, 127)
            self.index.train(embeddings)  # IVF indexes must be trained before vectors are added
elif index_type == "hnsw":
self.index = faiss.IndexHNSWFlat(self.dimension, 32)
        # Add the embeddings to the index
self.index.add(np.array(embeddings).astype('float32'))
print(f"✅ 索引构建完成,包含 {len(self.documents)} 个向量")
return self.index
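    # Background on the three index types above (FAISS behaviour, for reference):
    #   "flat"    - IndexFlatL2 performs exact brute-force L2 search; no training needed.
    #   "ivfflat" - IndexIVFFlat clusters vectors into nlist cells (127 here) and must be
    #               trained before add(); queries probe a subset of cells, trading recall for speed.
    #   "hnsw"    - IndexHNSWFlat builds a neighbour graph (32 links per node here) for
    #               approximate search without a training step, at a higher memory cost.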
def save_index(self, save_dir: str):
"""保存索引和元数据"""
if not os.path.exists(save_dir):
os.makedirs(save_dir)
        # Save the FAISS index
        faiss.write_index(self.index, os.path.join(save_dir, "vector_index.index"))
        # Save the metadata (CSV, easy to inspect by hand)
import csv
with open(os.path.join(save_dir, "metadata.csv"), 'w', newline='', encoding='utf-8') as f:
writer = csv.DictWriter(f, fieldnames=["source", "path", "chunk_id", "type"])
writer.writeheader()
writer.writerows(self.metadata)
        # Save the document chunks (optional)
with open(os.path.join(save_dir, "documents.txt"), 'w', encoding='utf-8') as f:
for doc in self.documents:
f.write(doc.replace('\n', ' ') + '\n')
print(f"💾 索引保存到: {save_dir}")
def load_index(self, save_dir: str):
"""加载已有索引"""
# 加载FAISS索引
self.index = faiss.read_index(os.path.join(save_dir, "vector_index.index"))
# 加载元数据
import csv
self.metadata = []
with open(os.path.join(save_dir, "metadata.csv"), 'r', encoding='utf-8') as f:
reader = csv.DictReader(f)
for row in reader:
self.metadata.append(row)
        # Load the document chunks
self.documents = []
if os.path.exists(os.path.join(save_dir, "documents.txt")):
with open(os.path.join(save_dir, "documents.txt"), 'r', encoding='utf-8') as f:
self.documents = [line.strip() for line in f]
print(f"🔍 加载索引成功,包含 {len(self.metadata)} 个文档片段")
def search(self, query: str, top_k: int = 5):
"""向量相似度搜索"""
if self.index is None:
raise RuntimeError("索引未初始化,请先加载或构建索引")
query_embed = self.embedder.encode([query])
distances, indices = self.index.search(query_embed, top_k)
results = []
for idx, dist in zip(indices[0], distances[0]):
results.append({
"text": self.documents[idx],
"metadata": self.metadata[idx],
"distance": float(dist)
})
return results
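# One-time index build (hypothetical usage sketch): the __main__ block below only calls
# load_index("vector_db_storage"), so the index must be built beforehand, for example
# with a small script like this (the "docs" folder name is an assumption):
#
#   db = FileBasedVectorDB()
#   db.import_from_folder("docs")            # or db.import_from_filelist("files.txt")
#   db.build_index(index_type="flat")
#   db.save_index("vector_db_storage")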
class VoiceAssistantApp:
def __init__(self, root, vector_db, result_queue):
self.root = root
self.vector_db = vector_db
self.root.title("大牧人AI助手")
self.root.geometry("1024x768")
self.root.resizable(True, True)
        # Build the main UI frame
self.create_main_frame()
        # Default user-configurable settings
self.settings = {
"voice_engine": "Google",
"language": "中文",
"hotword": "你好,牧牧",
"volume": 70,
"response_speed": 3
}
        # Load the speech-recognition model
        self.model = Model("vosk-model-small-cn-0.22")  # path to the locally downloaded Vosk model
        self.recognizer = KaldiRecognizer(self.model, 16000)
        self.recognizer.SetWords(True)  # SetWords takes a boolean (word-level output), not a vocabulary list
        # Audio queue and stream
        self.audio_queue = queue.Queue()
        self.audio_stream = None
        self.result_queue = result_queue
        # Thread control
        self.listening = False
        self.processing_thread = None
        self.stop_event = threading.Event()
        self.last_sound_time = 0
        self.silence_threshold = 500  # assumed RMS silence threshold for int16 audio; tune for your microphone
def intelligent_postprocess(self, text):
        # Map Latin letters to Chinese phonetic readings
letter_map = {'A':'诶','B':'必','C':'西','D':'迪','E':'伊','F':'艾弗',
'G':'吉','H':'艾尺','I':'艾','J':'杰','K':'开','L':'艾勒',
'M':'艾姆','N':'艾娜','O':'哦','P':'屁','Q':'克由','R':'艾儿',
'S':'艾丝','T':'提','U':'伊吾','V':'维','W':'达布溜','X':'艾克斯',
'Y':'歪','Z':'贼德'}
        # Expand letter/word combinations (e.g. "AI系统" -> "诶艾系统")
        processed = ""
        buffer = ""
        for char in text:
            if char.isupper():
                buffer += char
            else:
                if buffer:
                    # Check whether the buffered letters form a known abbreviation
                    if buffer in ["AI", "CPU", "GPU"]:
                        processed += "".join(letter_map[c] for c in buffer)
                    else:
                        processed += letter_map.get(buffer, buffer)
                    buffer = ""
                processed += char
        # Flush letters still buffered when the text ends with uppercase characters
        if buffer:
            if buffer in ["AI", "CPU", "GPU"]:
                processed += "".join(letter_map[c] for c in buffer)
            else:
                processed += letter_map.get(buffer, buffer)
        # Context-based corrections
corrections = {
"西提艾": "CPU",
"记屁优": "GPU",
"诶艾": "AI"
}
for wrong, correct in corrections.items():
processed = processed.replace(wrong, correct)
return processed
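    # Illustration of the two stages above (based on the tables defined in this method):
    #   letter expansion : "AI系统" -> "诶艾系统"
    #   context fixes    : "记屁优" -> "GPU", and "诶艾" is folded back to "AI"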
    def audio_callback(self, indata, frames, time_info, status):
        """Audio callback: push raw data into the queue and track when sound was last heard."""
        if status:
            print(f"音频错误: {status}")
        try:
            # Compute the RMS (volume) of the current block; indata is a raw int16 buffer
            samples = np.frombuffer(indata, dtype=np.int16).astype(np.float32)
            rms = np.sqrt(np.mean(samples ** 2))
            self.audio_queue.put(bytes(indata))
            # Silence detection against the threshold set in __init__
            is_silent = rms < self.silence_threshold
            # Update the time at which sound was last detected
            # (the parameter is named time_info so it does not shadow the time module)
            if not is_silent:
                self.last_sound_time = time.time()
        except Exception as e:
            error_msg = f"音频队列错误: {str(e)}"
            print(error_msg)
def create_main_frame(self):
        # Main frame
self.main_frame = ttk.Frame(self.root, padding=20)
self.main_frame.pack(fill=tk.BOTH, expand=True)
        # Title label
title_label = ttk.Label(
self.main_frame,
text="大牧人AI助手",
font=("微软雅黑", 24, "bold"),
foreground="#2c3e50"
)
title_label.pack(pady=20)
        # Status display area
status_frame = ttk.LabelFrame(self.main_frame, text="系统状态")
status_frame.pack(fill=tk.X, padx=10, pady=10)
self.status_label = ttk.Label(
status_frame,
text="准备就绪",
foreground="green"
)
self.status_label.pack(pady=5)
        # Interaction log text box
log_frame = ttk.LabelFrame(self.main_frame, text="交互记录")
log_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=10)
self.log_text = tk.Text(
log_frame,
height=10,
wrap=tk.WORD,
state=tk.NORMAL
)
self.log_text.pack(fill=tk.BOTH, expand=True, padx=5, pady=5)
self.log_text.insert(tk.END, "系统启动...\n")
self.log_text.config(state=tk.DISABLED)
scrollbar = ttk.Scrollbar(log_frame, command=self.log_text.yview)
scrollbar.pack(side=tk.RIGHT, fill=tk.Y)
self.log_text.config(yscrollcommand=scrollbar.set)
        # Input area
self.create_input_fields()
        # Control button area
button_frame = ttk.Frame(self.main_frame)
button_frame.pack(fill=tk.X, padx=10, pady=20)
self.start_btn = ttk.Button(
button_frame,
text="启动监听",
command=self.toggle_listening,
width=15
)
self.start_btn.pack(side=tk.LEFT, padx=10)
settings_btn = ttk.Button(
button_frame,
text="参数设置",
command=self.open_settings,
width=15
)
settings_btn.pack(side=tk.LEFT, padx=10)
clear_btn = ttk.Button(
button_frame,
text="清空记录",
command=self.clear_log,
width=15
)
clear_btn.pack(side=tk.LEFT, padx=10)
exit_btn = ttk.Button(
button_frame,
text="退出系统",
command=self.root.quit,
width=15
)
exit_btn.pack(side=tk.RIGHT, padx=10)
        # Listening state flag
self.listening = False
def create_input_fields(self):
self.input_entry = ttk.Entry(self.main_frame, width=25)
self.input_entry.pack(fill=tk.BOTH, expand=True, padx=10, pady=10)
def rkllm_chat(self, user_message):
is_streaming = True
try:
if user_message == "exit":
print("============================")
print("The RKLLM Server is stopping......")
print("============================")
else:
# Set the request headers; in this case, the headers have no actual effect and are only used to simulate the OpenAI interface design.
headers = {
'Content-Type': 'application/json',
'Authorization': 'not_required'
}
# Prepare the data to be sent
# model: The model defined by the user when setting up RKLLM-Server; this has no effect here
# messages: The user's input question, which RKLLM-Server will use as input and return the model's reply; multiple questions can be added to messages
# stream: Whether to enable streaming conversation, similar to the OpenAI interface
data = {
"model": 'your_model_deploy_with_RKLLM_Server',
"messages": [{"role": "user", "content": user_message}],
"stream": is_streaming,
"enable_thinking": False,
"tools": None
}
# Send a POST request
responses = session.post(server_url, json=data, headers=headers, stream=is_streaming, verify=False)
if not is_streaming:
# Parse the response
if responses.status_code == 200:
print("Q:", data["messages"][-1]["content"])
print("A:", json.loads(responses.text)["choices"][-1]["message"]["content"])
else:
print("Error:", responses.text)
else:
if responses.status_code == 200:
print("Q:", data["messages"][-1]["content"])
print("A:", end="")
answer = "牧牧:"
for line in responses.iter_lines():
if line:
line = json.loads(line.decode('utf-8'))
if line["choices"][-1]["finish_reason"] != "stop":
print(line["choices"][-1]["delta"]["content"], end="")
answer+=line["choices"][-1]["delta"]["content"]
sys.stdout.flush()
self.add_log(answer)
byte = answer.encode('utf-8')
                        result_xtts = xtts_lib.ai_xtts(byte)  # the C function expects a bytes (char*) argument
else:
print('Error:', responses.text)
except KeyboardInterrupt:
# Capture Ctrl-C signal to close the session
session.close()
print("\n")
print("============================")
print("The RKLLM Server is stopping......")
print("============================")
    def toggle_listening(self):
        """Toggle voice listening on or off."""
        self.listening = not self.listening
        if self.listening:
            self.start_btn.config(text="停止监听")
            self.status_label.config(text="正在监听...", foreground="red")
            self.add_log("开始语音监听...")
            # Re-create the recognizer so no stale state is carried over
            self.recognizer = KaldiRecognizer(self.model, 16000)
            self.recognizer.SetWords(True)  # boolean flag, see __init__
            # Start the audio stream
self.audio_stream = sd.RawInputStream(
samplerate=16000,
blocksize=4000,
dtype='int16',
channels=1,
callback=self.audio_callback
)
self.audio_stream.start()
self.add_log("音频流已启动")
            # Start the processing thread
self.stop_event.clear()
self.processing_thread = threading.Thread(target=self.process_audio)
self.processing_thread.daemon = True
self.processing_thread.start()
self.add_log("音频处理线程已启动")
else:
            # Stop listening
self.listening = False
self.start_btn.config(text="启动监听")
self.status_label.config(text="准备就绪", foreground="green")
self.add_log("停止语音监听")
            # Stop the audio stream
if self.audio_stream:
self.audio_stream.stop()
self.audio_stream.close()
self.audio_stream = None
self.add_log("音频流已关闭")
            # Signal the processing thread to stop
self.stop_event.set()
if self.processing_thread and self.processing_thread.is_alive():
self.processing_thread.join(timeout=1.0)
    def process_audio(self):
        """Worker thread: pull audio from the queue and run speech recognition."""
        self.add_log("音频处理线程启动")
        print("音频处理线程启动")
        # Timeout bookkeeping
        no_data_counter = 0
        MAX_NO_DATA = 20  # seconds of silence before recognition (also reused as the empty-queue poll limit)
audio_data = []
while not self.stop_event.is_set():
try:
                # Fetch audio data without blocking
                if not self.audio_queue.empty():
                    data = self.audio_queue.get()
                    audio_data.append(data)
                    print(f"\n检测到声音,继续录音 (最后声音时间 {self.last_sound_time:.1f})")
                # Check how long it has been silent
                current_time = time.time()
                silence_duration = current_time - self.last_sound_time
                if silence_duration > MAX_NO_DATA:
                    print(f"\n检测到静音持续 {silence_duration:.1f} 秒,停止录音")
                    # If any audio was buffered, feed it to the recognizer
                    if len(audio_data) > 0:
                        # AcceptWaveform expects bytes, so join the buffered chunks
                        if self.recognizer.AcceptWaveform(b"".join(audio_data)):
                            # Final recognition result
                            result = json.loads(self.recognizer.Result())
                            text = result.get("text", "")
                            if text:
                                print(f"\n识别结果: {text}")
                                self.result_queue.put(("full", text))
                                self.root.after(0, self.process_command, text)
                                self.stop_event.set()
                                self.root.after(0, self.toggle_listening)  # stop automatically
                        else:
                            # Partial result, useful for debugging
                            partial_result = json.loads(self.recognizer.PartialResult())
                            partial_text = partial_result.get("partial", "")
                            if partial_text:
                                print(f"\r实时识别: {partial_text}", end="", flush=True)
                                self.result_queue.put(("partial", partial_text))
                                self.root.after(0, self.process_command, partial_text)
                                self.stop_event.set()
                                self.root.after(0, self.toggle_listening)  # stop automatically
                time.sleep(0.1)  # brief sleep to reduce CPU usage
except queue.Empty:
                # Count consecutive empty-queue polls
no_data_counter += 1
if no_data_counter >= MAX_NO_DATA:
self.add_log("音频数据超时,停止监听")
self.root.after(0, self.toggle_listening) # 自动停止
continue
except Exception as e:
error_msg = f"音频处理错误: {str(e)}"
self.add_log(error_msg)
print(error_msg)
import traceback
traceback.print_exc()
self.add_log("音频处理线程退出")
print("音频处理线程退出")
        # Drain any audio left in the queue
while not self.audio_queue.empty():
try:
self.audio_queue.get_nowait()
except queue.Empty:
break
def process_command(self, command):
"""处理识别到的语音命令"""
self.add_log(f"用户: {command}")
        # Wake-word check (currently disabled)
        #if self.settings["hotword"] in command:
        #    command = command.replace(self.settings["hotword"], "").strip()
        #    self.add_log(f"检测到唤醒词: {self.settings['hotword']}")
if not self.result_queue.empty():
            # Vector similarity search over the local knowledge base
            no_space_command = command.translate(str.maketrans('', '', ' '))  # remove all spaces
results = self.vector_db.search(no_space_command)
context = ""
            for i, res in enumerate(results[:3]):  # take the three most relevant results
print(f"\n🔍 结果 #{i+1} (距离: {res['distance']:.3f})")
distance = res['distance']
if distance < 1.5:
context += res['text'][:150] + "\n"
prompt = f"上下文: {context}\n问题: {no_space_command}" if context else no_space_command
self.result_queue.task_done()
self.add_log(f"我: {no_space_command}...")
            # Query the LLM
self.rkllm_chat(prompt)
    def add_log(self, message):
        """Append a message to the interaction log."""
        self.log_text.config(state=tk.NORMAL)
        self.log_text.insert(tk.END, f"> {message}\n")
        self.log_text.see(tk.END)  # scroll to the bottom
        self.log_text.config(state=tk.DISABLED)
def clear_log(self):
"""清空日志"""
self.log_text.config(state=tk.NORMAL)
self.log_text.delete(1.0, tk.END)
self.log_text.insert(tk.END, "日志已清空\n")
self.log_text.config(state=tk.DISABLED)
    def open_settings(self):
        """Open the settings window."""
        settings_window = tk.Toplevel(self.root)
        settings_window.title("参数设置")
        settings_window.geometry("500x400")
        settings_window.transient(self.root)  # make it a transient child of the main window
        settings_window.grab_set()  # modal window
        # Settings frame
settings_frame = ttk.LabelFrame(settings_window, text="语音助手设置")
settings_frame.pack(fill=tk.BOTH, expand=True, padx=20, pady=20)
        # Voice engine
ttk.Label(settings_frame, text="语音引擎:").grid(row=0, column=0, padx=10, pady=10, sticky=tk.W)
engine_var = tk.StringVar(value=self.settings["voice_engine"])
engine_combo = ttk.Combobox(
settings_frame,
textvariable=engine_var,
values=["Google", "Baidu", "Microsoft", "Amazon"],
state="readonly",
width=15
)
engine_combo.grid(row=0, column=1, padx=10, pady=10, sticky=tk.W)
        # Recognition language
ttk.Label(settings_frame, text="识别语言:").grid(row=1, column=0, padx=10, pady=10, sticky=tk.W)
lang_var = tk.StringVar(value=self.settings["language"])
lang_combo = ttk.Combobox(
settings_frame,
textvariable=lang_var,
values=["中文", "English", "日本語", "Español"],
state="readonly",
width=15
)
lang_combo.grid(row=1, column=1, padx=10, pady=10, sticky=tk.W)
        # Wake word
ttk.Label(settings_frame, text="唤醒词:").grid(row=2, column=0, padx=10, pady=10, sticky=tk.W)
hotword_entry = ttk.Entry(settings_frame, width=20)
hotword_entry.insert(0, self.settings["hotword"])
hotword_entry.grid(row=2, column=1, padx=10, pady=10, sticky=tk.W)
        # Volume
ttk.Label(settings_frame, text="音量:").grid(row=3, column=0, padx=10, pady=10, sticky=tk.W)
volume_var = tk.IntVar(value=self.settings["volume"])
volume_scale = ttk.Scale(
settings_frame,
from_=0,
to=100,
variable=volume_var,
length=200,
orient=tk.HORIZONTAL
)
volume_scale.grid(row=3, column=1, padx=10, pady=10, sticky=tk.W)
volume_label = ttk.Label(settings_frame, text=f"{self.settings['volume']}%")
volume_label.grid(row=3, column=2, padx=5, sticky=tk.W)
        # Response speed
ttk.Label(settings_frame, text="响应速度:").grid(row=4, column=0, padx=10, pady=10, sticky=tk.W)
speed_var = tk.IntVar(value=self.settings["response_speed"])
speed_scale = ttk.Scale(
settings_frame,
from_=1,
to=5,
variable=speed_var,
length=200,
orient=tk.HORIZONTAL
)
speed_scale.grid(row=4, column=1, padx=10, pady=10, sticky=tk.W)
speed_label = ttk.Label(settings_frame, text=f"Level {self.settings['response_speed']}")
speed_label.grid(row=4, column=2, padx=5, sticky=tk.W)
        # Helpers that keep the value labels in sync with the sliders
def update_volume_label(val):
volume_label.config(text=f"{int(float(val))}%")
def update_speed_label(val):
speed_label.config(text=f"Level {int(float(val))}")
volume_scale.config(command=update_volume_label)
speed_scale.config(command=update_speed_label)
        # Save button
def save_settings():
self.settings = {
"voice_engine": engine_var.get(),
"language": lang_var.get(),
"hotword": hotword_entry.get(),
"volume": int(volume_var.get()),
"response_speed": int(speed_var.get())
}
messagebox.showinfo("设置保存", "参数已成功保存!")
settings_window.destroy()
self.add_log("更新系统参数设置")
save_btn = ttk.Button(
settings_frame,
text="保存设置",
command=save_settings,
width=15
)
save_btn.grid(row=5, column=1, pady=20)
if __name__ == "__main__":
    result_queue = queue.Queue()  # queue for recognition results
root = tk.Tk()
    vector_db = FileBasedVectorDB()  # initialize the local document vector database
    vector_db.load_index("vector_db_storage")  # load the pre-built local index
app = VoiceAssistantApp(root, vector_db, result_queue)
root.mainloop()