要开发一个flask语音识别接口,首先要解决语音文件在网络中的传输问题,然后选识别算法进行识别
1、以二进制文件流方式上传语音
python服务端代码,以flask.request.files接收前端的语音上传请求
# Server-side dependencies: only Flask's request object and json are actually
# used below; io/wave/os are imported but unused in this snippet.
from flask import Flask, request
import io
import wave
import os
import json
# Plain Flask application (no SocketIO needed for simple multipart uploads).
app = Flask(__name__)
@app.route('/upload_audio', methods=['POST'])
def upload_audio():
    """Receive an uploaded audio file and save it to output.wav.

    Expects a multipart/form-data POST with the audio under the "file" key.
    Returns a JSON string {'status', 'msg', 'result'}: status 0 on success,
    status 1 when no file was sent.
    """
    f_obj = request.files.get("file", None)
    if f_obj is None:
        return json.dumps({'status': 1, 'msg': 'No audio was received.', 'result': ''})
    audio_data = f_obj.read()
    # 'wb', not 'ab': appending a second upload after the first corrupts the
    # file — a WAV header must appear exactly once, at the start of the file.
    with open('output.wav', 'wb') as f:
        f.write(audio_data)
    return json.dumps({'status': 0, 'msg': '', 'result': 'receive audio success.'})
if __name__ == '__main__':
    # This first example is a plain Flask app: `socketio` is not defined in
    # this script, so the original socketio.run(...) raised NameError.
    app.run(port=8200, debug=True)
前端请求代码示例如下:
import requests
import time
def post_audio():
    """Upload ./c1.wav to the /upload_audio endpoint and print the outcome.

    Prints the round-trip time of the request, then either the server's
    error message (status != 0) or the success result.
    """
    url = "http://localhost:8200/upload_audio"
    # Context manager guarantees the file handle is closed even if the
    # request raises (the original leaked the open file object).
    with open('./c1.wav', 'rb') as audio_file:
        files = {'file': audio_file}
        t1 = time.time()
        r = requests.post(url, files=files)
        t2 = time.time()
    print("consume time: %f s" % (t2 - t1))  # fixed typo: "comsume"
    body = r.json()
    if body['status']:
        print(body['msg'])
    else:
        print(body['result'])


if __name__ == '__main__':
    post_audio()
2、网页端长连接流式上传语音文件
python服务端代码,接收网页端发来的语音片段并保存为.wav格式的语音文件,方便后面的语音识别
from flask import Flask, request, render_template
from flask_socketio import SocketIO, emit
import json
import base64
app = Flask(__name__)
# Flask-SocketIO requires a secret key for its session handling.
app.config['SECRET_KEY'] = 'secret'
# eventlet provides the async worker for long-lived websocket connections.
socketio = SocketIO(app, async_mode='eventlet')
# In-memory buffer of base64-encoded audio chunks for the current recording;
# filled by the 'audio_chunk' handler, flushed and cleared by 'audio_end'.
audio_chunks = []
@app.route('/')
def index():
    """Serve the recording page (templates/index.html)."""
    page = 'index.html'
    return render_template(page)
@socketio.on('audio_chunk')
def handle_audio_chunk(data):
    """Buffer one base64-encoded audio chunk and append its bytes to disk.

    The raw base64 string is kept in the module-level ``audio_chunks`` list
    for the final save in the 'audio_end' handler; the decoded bytes are
    also appended to audio_chunk.wav as an on-disk trace of the stream.
    """
    global audio_chunks
    audio_chunks.append(data)
    raw = base64.b64decode(data)
    # NOTE(review): 'ab' means this trace file keeps growing across
    # recording sessions and is never reset — confirm that is intended.
    with open('audio_chunk.wav', 'ab') as trace:
        trace.write(raw)
@socketio.on('audio_end')
def handle_audio_end():
    """Flush all buffered chunks to uploaded_audio.wav and notify the client.

    Bug fixed: the original wrote only audio_chunks[0], silently dropping
    every chunk after the first. All chunks are now decoded and concatenated
    in arrival order.
    """
    global audio_chunks
    if audio_chunks:
        print("开始保存语音文件")
        # 'wb' so each finished recording replaces the previous file instead
        # of being appended after stale audio from an earlier session.
        with open('uploaded_audio.wav', 'wb') as f:
            for chunk in audio_chunks:
                f.write(base64.b64decode(chunk))
        print("服务端保存语音文件完成")
        audio_chunks = []  # clear the buffer for the next recording
        emit('audio_saved', {'message': 'Audio saved successfully!'})
@socketio.on('connect')
def connected_msg():
    """Log that a Socket.IO client has connected."""
    message = '客户端连接成功,client connected!'
    print(message)
@socketio.on('disconnect')
def disconnect_msg():
    """Log that a Socket.IO client has disconnected."""
    message = '客户端断开连接,client disconnected!'
    print(message)
if __name__ == '__main__':
    # Run through the SocketIO wrapper (not app.run) so websocket events work.
    socketio.run(app, port=8200, debug=True)
前端html及JavaScript代码如下:
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Audio Stream Upload</title>
    <script src="https://cdn.socket.io/4.0.0/socket.io.min.js"></script>
</head>
<body>
    <h1>Upload Audio Stream</h1>
    <button id="start-recording">Start Recording</button>
    <button id="stop-recording" disabled>Stop Recording</button>
    <p id="status"></p>
    <script>
        const socket = io.connect('http://localhost:8200');
        let mediaRecorder;
        let audioChunks = [];

        // Convert an ArrayBuffer to base64 in 32 KiB slices. The original
        // String.fromCharCode(...new Uint8Array(buf)) spreads every byte as
        // a separate argument and overflows the call stack on large chunks.
        function toBase64(arrayBuffer) {
            const bytes = new Uint8Array(arrayBuffer);
            let binary = '';
            const step = 0x8000;
            for (let i = 0; i < bytes.length; i += step) {
                binary += String.fromCharCode.apply(null, bytes.subarray(i, i + step));
            }
            return btoa(binary);
        }

        document.getElementById('start-recording').addEventListener('click', async () => {
            const stream = await navigator.mediaDevices.getUserMedia({ audio: true });
            mediaRecorder = new MediaRecorder(stream);

            mediaRecorder.ondataavailable = event => {
                audioChunks.push(event.data);
                event.data.arrayBuffer().then(arrayBuffer => {
                    socket.emit('audio_chunk', toBase64(arrayBuffer));
                });
            };

            mediaRecorder.onstop = () => {
                // Release the microphone; otherwise it stays captured (and
                // the browser's recording indicator stays on) after stop.
                stream.getTracks().forEach(track => track.stop());
                socket.emit('audio_end');
                document.getElementById('start-recording').disabled = false;
                document.getElementById('stop-recording').disabled = true;
                document.getElementById('status').textContent = 'Recording stopped. Waiting for server response...';
            };

            // Pass a timeslice (ms): without one, ondataavailable fires only
            // once when recording stops, so nothing actually streams.
            // NOTE(review): MediaRecorder emits webm/ogg, not WAV — the
            // server's ".wav" filenames are a misnomer; transcode if a real
            // WAV container is required.
            mediaRecorder.start(1000);
            document.getElementById('start-recording').disabled = true;
            document.getElementById('stop-recording').disabled = false;
            document.getElementById('status').textContent = 'Recording...';
        });

        document.getElementById('stop-recording').addEventListener('click', () => {
            mediaRecorder.stop();
        });

        socket.on('audio_saved', data => {
            document.getElementById('status').textContent = data.message;
        });
    </script>
</body>
</html>
启动python服务,浏览器访问http://localhost:8200/就可以看到如下网页:
3、语音识别接口
语音识别算法这里选择openai的开源项目:whisper,项目地址:https://github.com/openai/whisper
- 安装
pip install -U openai-whisper
还需要在终端安装ffmpeg,sudo apt update && sudo apt install ffmpeg
flask服务端代码如下:
import os
# Pin the visible GPU before importing whisper/torch so the device selection
# takes effect; must run before any CUDA-using import.
os.environ["CUDA_VISIBLE_DEVICES"] = '1'
from flask import Flask, request
import io
import wave
import json
import whisper
app = Flask(__name__)
# Load the model once at startup; loading per-request would be far too slow.
model = whisper.load_model("turbo") # or your model
@app.route('/audio_rec', methods=['POST'])
def audio_recognize():
    """Receive an audio file and transcribe it with Whisper.

    Expects a multipart/form-data POST with the audio under the "file" key.
    Returns a JSON string {'status', 'msg', 'result'} where result holds the
    recognized text on success.
    """
    f_obj = request.files.get("file", None)
    if f_obj is None:
        return json.dumps({'status': 1, 'msg': 'No audio was received.', 'result': ''})
    save_path = "temp.wav"
    # 'wb', not 'ab': appending would accumulate every previous request's
    # audio in temp.wav, so transcriptions would include earlier uploads.
    with open(save_path, 'wb') as f:
        f.write(f_obj.read())
    result = model.transcribe(save_path)
    return json.dumps({'status': 0, 'msg': '', 'result': result["text"]})
if __name__ == '__main__':
    # This script defines no `socketio` object, so the original
    # socketio.run(...) raised NameError; a plain Flask run is correct here.
    app.run(port=8200, debug=True)