unity 实现文字转语音TTS-Sherpa-onnx

原创已于 2025-07-29 14:10:52 修改 · 1.4k 阅读

15 ·

CC 4.0 BY-SA版权

文章标签：

#unity #离线语音 #文字转语音 #sherpa-onnx

于 2025-07-29 14:03:43 首次发布

sherpa-onnx-unity 专栏收录该内容

2 篇文章

订阅专栏

GitHub - k2-fsa/sherpa-onnx: Speech-to-text, text-to-speech, speaker diarization, speech enhancement, source separation, and VAD using next-gen Kaldi with onnxruntime without Internet connection. Support embedded systems, Android, iOS, HarmonyOS, Raspberry Pi, RISC-V, x86_64 servers, websocket server/client, support 12 programming languages
https://github.com/k2-fsa/sherpa-onnx

GitHub - xue-fei/sherpa-onnx-unity: sherpa-onnx-unity
https://github.com/xue-fei/sherpa-onnx-unity

感谢先驱。

模型地址：vits-melo-tts-zh_en
https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-melo-tts-zh_en.tar.bz2

ReadMe：

Introduction — sherpa 1.3 documentation：https://k2-fsa.github.io/sherpa/intro.html

此存储库支持在本地运行以下功能

语音转文本（即 ASR）；支持流式和非流式
文本转语音（即 TTS）
说话人分离（Speaker diarization）
说话人识别
说话人验证
语种识别（Spoken language identification）
音频标记
VAD（例如，silero-vad）
语音增强（例如gtcrn）
关键词识别
源分离（例如，spleeter、UVR）

代码：

using UnityEngine;
using System.Collections;
using System.Collections.Generic;
using System;
using System.Threading;
using System.Linq;

/// <summary>
/// Thread helper ("Loom"). Attach one instance to a GameObject in the scene.
/// Background work is started with <see cref="RunAsync"/>; callbacks that must
/// run on Unity's main thread are queued with <see cref="QueueOnMainThread(Action)"/>
/// and executed from <see cref="Update"/>.
/// </summary>
public class Loom : MonoBehaviour
{
    /// <summary>Upper bound on concurrently running <see cref="RunAsync"/> work items.</summary>
    public static int maxThreads = 8;

    // Number of RunAsync work items currently queued or running.
    static int numThreads;

    /// <summary>The instance that most recently ran <c>Awake</c>; null until then.</summary>
    public static Loom Current;

    // Managed id of the thread that ran Awake, i.e. Unity's main thread.
    private static int mainThreadId = -1;

    // Last Time.time value sampled on the main thread. Used as the time base when a
    // delayed callback is queued from a worker thread, where Time.time must not be read.
    private static volatile float lastMainThreadTime;

    void Awake()
    {
        Current = this;
        mainThreadId = Thread.CurrentThread.ManagedThreadId;
        lastMainThreadTime = Time.time;
    }

    // Callbacks waiting to run on the next Update. Guarded by lock(_actions).
    private List<Action> _actions = new List<Action>();

    /// <summary>A callback together with the <c>Time.time</c> value at which it becomes due.</summary>
    public struct DelayedQueueItem
    {
        public float time;
        public Action action;
    }

    // Delayed callbacks that are not due yet. Guarded by lock(_delayed).
    private List<DelayedQueueItem> _delayed = new List<DelayedQueueItem>();

    // Scratch list reused every frame for the delayed callbacks that became due.
    List<DelayedQueueItem> _currentDelayed = new List<DelayedQueueItem>();

    /// <summary>Queues <paramref name="action"/> to run on the main thread during the next Update.</summary>
    public void QueueOnMainThread(Action action)
    {
        QueueOnMainThread(action, 0f);
    }

    /// <summary>
    /// Queues <paramref name="action"/> to run on the main thread.
    /// </summary>
    /// <param name="action">Callback to execute from <see cref="Update"/>.</param>
    /// <param name="time">Delay in seconds (Time.time based); 0 runs it on the next Update.</param>
    public void QueueOnMainThread(Action action, float time)
    {
        if (time != 0)
        {
            // Time.time is only read when the caller is the main thread; a worker
            // thread uses the value sampled by the most recent Awake/Update instead.
            bool onMainThread = Thread.CurrentThread.ManagedThreadId == mainThreadId;
            float now = onMainThread ? Time.time : lastMainThreadTime;

            lock (Current._delayed)
            {
                Current._delayed.Add(new DelayedQueueItem { time = now + time, action = action });
            }
        }
        else
        {
            lock (Current._actions)
            {
                Current._actions.Add(action);
            }
        }
    }

    /// <summary>
    /// Runs <paramref name="a"/> on a thread-pool thread. Blocks the caller (sleeping in
    /// 1 ms steps) while <see cref="maxThreads"/> work items are already in flight.
    /// </summary>
    /// <returns>Always null; the work is dispatched through the ThreadPool, so there is no Thread to hand back.</returns>
    public Thread RunAsync(Action a)
    {
        while (numThreads >= maxThreads) Thread.Sleep(1);

        Interlocked.Increment(ref numThreads);
        ThreadPool.QueueUserWorkItem(RunAction, a);
        return null;
    }

    // Thread-pool entry point: runs the work item and always releases its slot.
    private void RunAction(object action)
    {
        try
        {
            ((Action)action)();
        }
        catch (Exception e)
        {
            // Previously swallowed silently; report it so failed background work is visible.
            Debug.LogException(e);
        }
        finally
        {
            Interlocked.Decrement(ref numThreads);
        }
    }

    // Scratch list reused every frame for the immediate callbacks being executed.
    List<Action> _currentActions = new List<Action>();

    void Update()
    {
        float now = Time.time;
        lastMainThreadTime = now;

        // Snapshot under the lock, execute outside it so callbacks may queue more work.
        lock (_actions)
        {
            _currentActions.Clear();
            _currentActions.AddRange(_actions);
            _actions.Clear();
        }
        foreach (var a in _currentActions)
        {
            RunSafely(a);
        }

        lock (_delayed)
        {
            _currentDelayed.Clear();
            for (int i = 0; i < _delayed.Count; i++)
            {
                if (_delayed[i].time <= now) _currentDelayed.Add(_delayed[i]);
            }
            // Single pass removal instead of one List.Remove call per due item.
            if (_currentDelayed.Count > 0)
            {
                _delayed.RemoveAll(d => d.time <= now);
            }
        }
        foreach (var delayed in _currentDelayed)
        {
            RunSafely(delayed.action);
        }
    }

    // Runs one queued callback; a failure is logged so the remaining callbacks,
    // which were already removed from the queue, still execute this frame.
    private static void RunSafely(Action action)
    {
        try
        {
            action();
        }
        catch (Exception e)
        {
            Debug.LogException(e);
        }
    }
}


//Loom.Current.QueueOnMainThread(() => {//切换为主线程
//                              //todo 主线程的something
//});

//Loom.Current.RunAsync(() => {//切换为C#线程               
//                     //todo C#线程的something
//});

using SherpaOnnx;
using System.Collections.Generic;
using UnityEngine;
using AOT;
using System.IO;
using System;
using System.Runtime.InteropServices;

/// <summary>
/// Offline text-to-speech component built on sherpa-onnx (vits-melo-tts-zh_en model).
/// The model is loaded on a worker thread in <see cref="Start"/>; <see cref="Generate"/>
/// synthesizes on a worker thread and the samples delivered by the engine callback are
/// played through the attached <see cref="AudioSource"/> via a streaming AudioClip.
/// Requires a <c>Loom</c> component in the scene.
/// NOTE(review): model files are accessed with System.IO under StreamingAssets;
/// confirm this works on every target platform (e.g. packaged Android builds).
/// </summary>
[RequireComponent(typeof(AudioSource))]
public class Sherpa_TextToSpeech : MonoBehaviour
{
    public static Sherpa_TextToSpeech Instance;

    private OfflineTts ot;
    private OfflineTtsGeneratedAudio otga;
    private OfflineTtsConfig config;
    // Kept in a field so the delegate handed to the engine stays referenced.
    private OfflineTtsCallback otc;
    private AudioSource audioSource;
    private AudioClip audioClip = null;
    private int sampleRate = 22050;
    // Samples received from the engine for the current utterance. Guarded by lock(audioData).
    private List<float> audioData = new List<float>();
    // Read cursor into audioData for the streaming clip. Guarded by lock(audioData).
    private int curAudioClipPos = 0;
    /// <summary>Seconds of audio received so far for the current utterance (updated on the main thread).</summary>
    public float audioLength = 0f;
    /// <summary>True once the model has been loaded successfully.</summary>
    public bool initDone { get; private set; } = false;

    #region PathDefine
    // Resolved in Awake: Application.streamingAssetsPath is read once on the main
    // thread instead of in field initializers or on worker threads.
    private string modelFile;
    private string lexiconFile;
    private string tokensFile;
    private string dictDir;
    private string phone_fst;
    private string date_fst;
    private string number_fst;
    private string outputFile;
    #endregion

    private void Awake()
    {
        Instance = this;

        string modelRoot = $"{Application.streamingAssetsPath}/models/vits-melo-tts-zh_en";
        modelFile = $"{modelRoot}/model.onnx";
        lexiconFile = $"{modelRoot}/lexicon.txt";
        tokensFile = $"{modelRoot}/tokens.txt";
        dictDir = $"{modelRoot}/dict";
        phone_fst = $"{modelRoot}/phone.fst";
        date_fst = $"{modelRoot}/date.fst";
        number_fst = $"{modelRoot}/number.fst";
        // NOTE(review): the file is written by SaveToWaveFile but named ".mp3";
        // the name is kept as-is because other code may read this path — confirm and rename.
        outputFile = Application.streamingAssetsPath + "/1.mp3";

        audioSource = GetComponent<AudioSource>();
        audioSource.loop = true;
        audioSource.playOnAwake = false;
        DontDestroyOnLoad(gameObject);
    }

    private void Start()
    {
        initDone = false;
        if (Loom.Current == null)
        {
            Debug.LogError("场景中未找到 Loom 组件，无法初始化文字转语音");
            return;
        }
        Debug.Log("开始初始化模型，请等待...");
        Loom.Current.RunAsync(Init);
    }

    // Worker-thread entry: validates the model files, builds the config and loads the engine.
    private void Init()
    {
        if (!FileCheck()) return;
        try
        {
            config = new OfflineTtsConfig
            {
                Model =
                {
                    Vits =
                    {
                        Model = modelFile,
                        Lexicon = lexiconFile,
                        Tokens = tokensFile,
                        DictDir = dictDir,
                        NoiseScale = 0.667f,
                        NoiseScaleW = 0.8f,
                        LengthScale = 1f
                    },
                    NumThreads = 5,
                    Debug = 1,
                    Provider = "cpu"
                },
                RuleFsts = $"{phone_fst},{date_fst},{number_fst}",
                MaxNumSentences = 1
            };

            ot = new OfflineTts(config);
            sampleRate = ot.SampleRate;
            otc = new OfflineTtsCallback(StaticOnAudioData);
            initDone = true;

            Loom.Current.QueueOnMainThread(() =>
                Debug.Log("文字转语音初始化完成"));
        }
        catch (Exception e)
        {
            Loom.Current.QueueOnMainThread(() =>
                Debug.LogError("初始化文字转语音时发生错误: " + e.Message));
        }
    }

    /// <summary>
    /// Verifies that every file/directory handed to the engine exists, including the
    /// rule FST files listed in <c>RuleFsts</c>. Logs the first missing entry.
    /// </summary>
    /// <returns>True when everything required is present.</returns>
    private bool FileCheck()
    {
        if (!RequireFile(modelFile, "模型文件不存在: ")) return false;
        if (!RequireFile(lexiconFile, "词典文件不存在: ")) return false;
        if (!RequireFile(tokensFile, "tokens文件不存在: ")) return false;
        if (!Directory.Exists(dictDir))
        {
            Debug.LogError("字典目录不存在: " + dictDir);
            return false;
        }
        if (!RequireFile(phone_fst, "规则FST文件不存在: ")) return false;
        if (!RequireFile(date_fst, "规则FST文件不存在: ")) return false;
        if (!RequireFile(number_fst, "规则FST文件不存在: ")) return false;
        return true;
    }

    // Logs "<message><path>" and returns false when the file is missing.
    private static bool RequireFile(string path, string message)
    {
        if (File.Exists(path)) return true;
        Debug.LogError(message + path);
        return false;
    }

    // Static trampoline handed to the engine; forwards to the live instance.
    [MonoPInvokeCallback(typeof(OfflineTtsCallback))]
    private static int StaticOnAudioData(IntPtr samples, int n)
    {
        return Instance?.OnAudioData(samples, n) ?? 0;
    }

    /// <summary>
    /// Receives a chunk of <paramref name="n"/> float samples from the engine, appends it
    /// to the buffer and schedules the main-thread bookkeeping / playback start.
    /// </summary>
    /// <returns>
    /// <paramref name="n"/> after buffering the chunk, or 0 for an empty chunk.
    /// NOTE(review): sherpa-onnx presumably treats the value as a continue/stop flag — confirm.
    /// </returns>
    private int OnAudioData(IntPtr samples, int n)
    {
        if (n <= 0)
        {
            Loom.Current.QueueOnMainThread(() =>
                Debug.LogWarning("收到空的音频数据"));
            return 0;
        }

        float[] tempData = new float[n];
        Marshal.Copy(samples, tempData, 0, n);

        lock (audioData)
        {
            audioData.AddRange(tempData);
        }

        Loom.Current.QueueOnMainThread(() =>
        {
            Debug.Log($"收到音频数据，长度: {n}");
            audioLength += n / (float)sampleRate;
            Debug.Log($"音频长度增加 {n / (float)sampleRate} 秒");

            // Start streaming once more than two seconds of audio are buffered.
            TryStartPlayback(sampleRate * 2);
        });

        return n;
    }

    /// <summary>
    /// Starts the streaming clip when the source is idle and more than
    /// <paramref name="minBufferedSamples"/> samples are buffered. Main thread only.
    /// </summary>
    private void TryStartPlayback(int minBufferedSamples)
    {
        if (audioSource == null || audioSource.isPlaying) return;

        int buffered;
        lock (audioData)
        {
            buffered = audioData.Count;
            if (buffered <= minBufferedSamples) return;
            curAudioClipPos = 0;
        }

        // The lock is released before AudioClip.Create so the read callback
        // (which takes the same lock) can never be blocked by this method.
        Debug.Log($"开始播放音频，数据长度: {buffered}");
        if (audioClip != null) Destroy(audioClip);
        audioClip = AudioClip.Create("SynthesizedAudio", sampleRate * 2, 1, sampleRate, true, OnAudioRead);
        audioSource.clip = audioClip;
        audioSource.Play();
    }

    // PCM reader callback of the streaming clip.
    private void OnAudioRead(float[] data)
    {
        ExtractAudioData(data);
    }

    /// <summary>
    /// Fills <paramref name="data"/> with the next buffered samples and pads the rest with zeros.
    /// </summary>
    /// <returns>True when at least one buffered sample was copied.</returns>
    private bool ExtractAudioData(float[] data)
    {
        if (data == null || data.Length == 0) return false;

        bool hasData = false;
        int dataIndex = 0;

        lock (audioData)
        {
            if (audioData.Count > 0 && curAudioClipPos < audioData.Count)
            {
                int copyCount = Mathf.Min(data.Length, audioData.Count - curAudioClipPos);
                audioData.CopyTo(curAudioClipPos, data, 0, copyCount);
                curAudioClipPos += copyCount;
                hasData = copyCount > 0;
                dataIndex = copyCount;
            }
        }

        // Zero-fill whatever could not be served from the buffer (silence).
        if (dataIndex < data.Length)
            Array.Clear(data, dataIndex, data.Length - dataIndex);

        return hasData;
    }

    /// <summary>
    /// Synthesizes <paramref name="text"/> on a worker thread, streams it to the
    /// AudioSource and saves the result to the output file.
    /// </summary>
    /// <param name="text">Text to speak.</param>
    /// <param name="speed">Speed value passed through to the engine.</param>
    /// <param name="speakerId">Speaker id passed through to the engine.</param>
    public void Generate(string text, float speed, int speakerId)
    {
        if (!initDone)
        {
            Debug.LogWarning("文字转语音未完成初始化");
            return;
        }
        if (!File.Exists(modelFile))
        {
            Debug.LogError("模型文件不存在: " + modelFile);
            return;
        }
        Debug.Log($"开始生成语音，文本为：{text}");
        Loom.Current.RunAsync(() =>
        {
            try
            {
                Debug.Log("异步生成语音开始");
                lock (audioData)
                {
                    audioData.Clear();
                    curAudioClipPos = 0;
                }
                // audioLength is accumulated on the main thread, so reset it there as well;
                // the queue is FIFO, so this runs before this utterance's increments.
                Loom.Current.QueueOnMainThread(() => audioLength = 0f);

                // Release the previous result before it is replaced.
                var previous = otga;
                otga = null;
                previous?.Dispose();

                otga = ot.GenerateWithCallback(text, speed, speakerId, otc);

                // Utterances shorter than the two-second threshold never triggered playback
                // from the chunk callback; start it now that everything is buffered.
                Loom.Current.QueueOnMainThread(() => TryStartPlayback(0));

                if (otga.SaveToWaveFile(outputFile))
                {
                    Debug.Log("异步生成语音结束，保存完成");
                }
            }
            catch (Exception e)
            {
                Loom.Current.QueueOnMainThread(() =>
                    Debug.LogError("生成语音时发生错误: " + e.Message));
            }
        });
    }

    // Releases the engine and the last generated audio when the application exits.
    private void OnApplicationQuit()
    {
        ot?.Dispose();
        ot = null;
        otga?.Dispose();
        otga = null;
        otc = null;
    }
}

using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using UnityEngine.UI;

/// <summary>
/// Simple UI panel: clicking <see cref="btn_Generate"/> sends the text of
/// <see cref="txt_Des"/> to <c>Sherpa_TextToSpeech</c> with speed 1.0 and speaker id 0.
/// </summary>
public class TTS_Panel : MonoBehaviour
{
    public Text txt_Des;
    public Button btn_Generate;

    void Start()
    {
        if (btn_Generate == null)
        {
            Debug.LogError("TTS_Panel: btn_Generate 未赋值");
            return;
        }
        btn_Generate.onClick.AddListener(OnGenerateClicked);
    }

    void OnDestroy()
    {
        // Unsubscribe so the button does not keep a reference to a destroyed panel.
        if (btn_Generate != null)
        {
            btn_Generate.onClick.RemoveListener(OnGenerateClicked);
        }
    }

    // Click handler: ignores empty/whitespace text and a missing TTS instance.
    private void OnGenerateClicked()
    {
        // IsNullOrWhiteSpace already covers null and empty strings.
        if (txt_Des == null || string.IsNullOrWhiteSpace(txt_Des.text)) return;

        var tts = Sherpa_TextToSpeech.Instance;
        if (tts == null)
        {
            Debug.LogWarning("Sherpa_TextToSpeech 实例不存在，无法生成语音");
            return;
        }

        tts.Generate(txt_Des.text, 1.0f, 0);
    }
}