核心需求: 用 Gradio 和 WebRTC (gradio_webrtc) 演示端到端语音对话, 需要在返回语音对话内容的同时输出结构化数据.
需要用到 `AdditionalOutputs`, 并给 `WebRTC` 组件注册 `on_additional_outputs` 回调.
正常情况下, `emit` 返回音频时只需要返回 `(rate, audio_data)`; 现在改成返回 `(rate, audio_data), AdditionalOutputs(...)`, 把结构化数据和音频一起带出.
代码大致示例如下:
from gradio_webrtc import AsyncStreamHandler, WebRTC, async_aggregate_bytes_to_16bit, AdditionalOutputs
import gradio as gr
class StreamHandler(AsyncStreamHandler):
    """Demo handler that returns audio together with structured side data.

    Every ``emit`` call appends a canned assistant message to two
    conversation logs and returns them, wrapped in ``AdditionalOutputs``,
    next to the audio tuple.
    """

    def __init__(self, *args, **kwargs):
        """Initialise the base handler and the two empty conversation logs.

        Args:
            *args: Positional arguments forwarded unchanged to the base class.
            **kwargs: Keyword arguments forwarded unchanged to the base class.
        """
        # Forward to the base class: emit() reads self.output_sample_rate,
        # which this class never assigns itself.
        super().__init__(*args, **kwargs)
        # Remembered so copy() can build an equivalent handler.
        self._init_args = args
        self._init_kwargs = kwargs
        self.convo = []
        self.gradio_convo = []

    def copy(self):
        """Return a new handler built with the same constructor arguments."""
        # NOTE(review): the base class is expected to declare copy() as
        # required (one handler per connection) - confirm against the
        # installed gradio_webrtc version.
        return type(self)(*self._init_args, **self._init_kwargs)

    async def receive(self, frame):
        """Accept an incoming frame; this demo ignores microphone input."""
        # NOTE(review): the base class is expected to declare receive() as
        # required - confirm against the installed gradio_webrtc version.

    async def emit(self):
        """Produce one audio chunk plus the updated conversation logs.

        Returns:
            A pair ``((sample_rate, audio_array), AdditionalOutputs(...))``
            where the additional outputs carry ``self.convo`` and
            ``self.gradio_convo`` in that order.
        """
        import numpy as np

        msg = 'hello world ~~'
        message = {'role': 'assistant', 'content': msg}
        self.convo.append(message)
        self.gradio_convo.append(message)
        # Placeholder audio: the original snippet returned an undefined
        # `audio_array`. Replace this silent frame with real model output.
        # NOTE(review): assumes int16 samples shaped (1, frame_size) and a
        # base-class `output_frame_size` attribute - confirm.
        frame_size = getattr(self, "output_frame_size", 960)
        audio_array = np.zeros((1, frame_size), dtype=np.int16)
        return (self.output_sample_rate, audio_array), AdditionalOutputs(self.convo, self.gradio_convo)
# Layout: a row holding the conversation state, the WebRTC audio component
# and a chat transcript, followed by the audio stream wiring.
# NOTE(review): the row is created with visible=False and nothing in this
# snippet makes it visible again - confirm that is intended.
with gr.Row(visible=False) as row:
    # Holds the raw conversation; it is one of the targets of the
    # on_additional_outputs registration below.
    convo = gr.State(value=[])
    webrtc = WebRTC(
        label="Conversation",
        modality="audio",
        mode="send-receive",
    )
    with gr.Column():
        transcript = gr.Chatbot(label="transcript", type="messages")

webrtc.stream(
    StreamHandler(),
    inputs=[webrtc],
    # The stream event itself only carries audio, so the WebRTC component is
    # its single output (the original listed it twice). Structured data goes
    # through on_additional_outputs instead.
    outputs=[webrtc],
    time_limit=90,
    concurrency_limit=2,
)
def update_transcript(convo, transcript):
    """Pass the handler's additional outputs straight through to the UI.

    Args:
        convo: First value carried by ``AdditionalOutputs`` (the raw
            conversation log).
        transcript: Second value carried by ``AdditionalOutputs`` (the
            chat-formatted log).

    Returns:
        The tuple ``(convo, transcript)``, unchanged and in the same order.
    """
    return convo, transcript
# Route the values carried by AdditionalOutputs into the `convo` state and
# the `transcript` chatbot, in that order, via update_transcript.
webrtc.on_additional_outputs(
    update_transcript,
    outputs=[convo, transcript],
    queue=True,
    show_progress="hidden",
)