【语音识别】【本地RAG大模型语音输入解决方案】调用在线API

一.选取在线API

这里我选用调用在线 API 的方式，对语音输入进行识别处理。

【不推荐使用百度语音识别 API】【推荐使用阿里语音识别 API】

二.地址及代码Demo

阿里百炼控制台：https://bailian.console.aliyun.com/?spm=a2c4g.11186623.0.0.266b2562QyJKfm#/home

代码Demo(FastAPI调用)

import json

import os

import shutil

from fastapi import FastAPI, File, UploadFile, APIRouter

from dashscope import MultiModalConversation

import dashscope

from src.utils import setup_logger

# Initialise the logger used by the audio routes in this module.

logger = setup_logger("server-audio")

# Directory where uploaded audio files are saved.
# NOTE(review): "xxx" is presumably a placeholder -- replace with a real path.

UPLOAD_DIR = r"xxx"

os.makedirs(UPLOAD_DIR, exist_ok=True)

# Set the DashScope API key.
# NOTE(review): "xxx" is presumably a placeholder -- supply a real key.

dashscope.api_key = "xxx"

# Define the router; routes registered on it are served under the /audio prefix.

audio = APIRouter(prefix="/audio")

def _save_upload(file):
    """Save an uploaded file into UPLOAD_DIR and return where it was written.

    Args:
        file: The FastAPI ``UploadFile`` received by the endpoint.

    Returns:
        A dict with ``"filename"`` (the generated file name) and
        ``"file_path"`` (that name joined onto ``UPLOAD_DIR``).
    """
    # The client controls ``file.filename``. Keep only its final path
    # component (treating both "/" and "\\" as separators) so a name such as
    # "../../x" cannot make the file land outside UPLOAD_DIR.
    raw_name = file.filename or "upload"
    safe_name = os.path.basename(raw_name.replace("\\", "/")) or "upload"

    # A random suffix keeps concurrent uploads of the same name from colliding.
    filename = f"{safe_name}_{os.urandom(8).hex()}.wav"
    file_path = os.path.join(UPLOAD_DIR, filename)

    with open(file_path, "wb") as buffer:
        shutil.copyfileobj(file.file, buffer)

    return {"filename": filename, "file_path": file_path}


def _extract_text(response_dict):
    """Return the first text entry of a parsed response, or None.

    The lookup path is ``output -> choices[0] -> message -> content[0] ->
    text``. A missing or empty level is logged as an error and yields None.
    """
    output = response_dict.get("output")
    choices = output.get("choices") if output else None
    if not choices:
        logger.error("No output found in response.")
        return None

    message = choices[0].get("message")
    content = message.get("content") if message else None
    if not content:
        logger.error("No choices or message found in response.")
        return None

    text_content = content[0].get("text")
    if not text_content:
        logger.error("No text content found in response.")
        return None

    return text_content


def _transcribe(file_path):
    """Send a local audio file to the ``qwen-audio-asr`` model.

    Args:
        file_path: Path of the saved audio file.

    Returns:
        The recognised text, or None when the response carries no text.
    """
    # The file is passed as a "file://" URI; backslashes are normalised to
    # forward slashes so Windows paths are accepted as well.
    full_path = r"file://" + file_path.replace("\\", "/")
    messages = [
        {
            "role": "user",
            "content": [{"audio": full_path}],
        }
    ]

    response = MultiModalConversation.call(model="qwen-audio-asr", messages=messages)
    # The response is round-tripped through its string form to get a plain dict.
    response_dict = json.loads(str(response))
    return _extract_text(response_dict)


@audio.post("/upload")
async def create_audio_file(file: UploadFile = File(...)):
    """Receive an uploaded audio file, store it, and return its transcription.

    Args:
        file: The uploaded audio file (multipart form field ``file``).

    Returns:
        The recognised text, or None if the recognition response contained
        no usable text (the reason is logged).
    """
    # NOTE(review): the file copy and the DashScope call below are synchronous
    # and run inside this coroutine, so they block the event loop while they
    # execute. Consider moving them to a worker thread if throughput matters.
    file_info = _save_upload(file)
    return _transcribe(file_info["file_path"])