一.选取在线API
这里我选用了调用在线API的方式进行语音输入的识别处理
【不推荐 百度识别 API】【推荐 阿里识别 API】
二.地址及代码Demo
阿里百炼控制台:https://bailian.console.aliyun.com/?spm=a2c4g.11186623.0.0.266b2562QyJKfm#/home
代码Demo(FastAPI调用)
import json
import os
import shutil
from fastapi import FastAPI, File, UploadFile, APIRouter
from dashscope import MultiModalConversation
import dashscope
from src.utils import setup_logger
# 初始化日志
logger = setup_logger("server-audio")
# 设置文件上传的目录
UPLOAD_DIR = r"xxx"
os.makedirs(UPLOAD_DIR, exist_ok=True)
# 设置 DashScope API Key
dashscope.api_key = "xxx"
# 定义路由
audio = APIRouter(prefix="/audio")
@audio.post("/upload")
async def create_audio_file(file: UploadFile = File(...)):
def load_file(file):
# 生成文件名
filename = f"{file.filename}_{os.urandom(8).hex()}.wav"
file_path = os.path.join(UPLOAD_DIR, filename) # 使用 os.path.join 拼接路径
# 保存文件
with open(file_path, "wb") as buffer:
shutil.copyfileobj(file.file, buffer)
return {"filename": filename, "file_path": file_path}
def convert(file_path):
# 确保路径格式正确
full_path = r"file://" + file_path.replace("\\", "/") # 替换反斜杠为正斜杠
messages = [
{
"role": "user",
"content": [{"audio": full_path}],
}
]
response = MultiModalConversation.call(model="qwen-audio-asr", messages=messages)
response_dict = json.loads(str(response))
if response_dict.get("output") and response_dict["output"].get("choices"):
choices = response_dict["output"]["choices"]
if choices and choices[0].get("message") and choices[0]["message"].get("content"):
content = choices[0]["message"]["content"]
if content and content[0].get("text"):
text_content = content[0]["text"]
return text_content
else:
logger.error("No text content found in response.")
else:
logger.error("No choices or message found in response.")
else:
logger.error("No output found in response.")
return None
# 加载文件并获取文件路径
file_info = load_file(file)
file_path = file_info.get("file_path")
# 调用转换函数
return convert(file_path)