GitHub - k2-fsa/sherpa-onnx: Speech-to-text, text-to-speech, speaker diarization, speech enhancement, source separation, and VAD using next-gen Kaldi with onnxruntime without Internet connection. Support embedded systems, Android, iOS, HarmonyOS, Raspberry Pi, RISC-V, x86_64 servers, websocket server/client, support 12 programming languages
https://github.com/k2-fsa/sherpa-onnx
GitHub - xue-fei/sherpa-onnx-unity: sherpa-onnx-unity
https://github.com/xue-fei/sherpa-onnx-unity
感谢先驱。
模型地址:vits-melo-tts-zh_en
https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-melo-tts-zh_en.tar.bz2
https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/vits-melo-tts-zh_en.tar.bz2
ReadMe:
Introduction — sherpa 1.3 documentation
https://k2-fsa.github.io/sherpa/intro.html
此存储库支持在本地运行以下功能
- 语音转文本(即 ASR);支持流式和非流式
- 文本转语音(即 TTS)
- 说话人分离(即 speaker diarization,区分"谁在什么时候说话")
- 说话人识别
- 说话人验证
- 口语语种识别(即 spoken language identification)
- 音频标记
- VAD(例如,silero-vad)
- 语音增强(例如gtcrn)
- 关键词识别
- 源分离(例如,spleeter、UVR)

代码:
using UnityEngine;
using System.Collections;
using System.Collections.Generic;
using System;
using System.Threading;
using System.Linq;
/// <summary>
/// Thread-dispatch helper. Attach it to a GameObject in the scene; <see cref="Current"/>
/// is assigned in <c>Awake</c>. <see cref="RunAsync"/> runs work on the thread pool and
/// <see cref="QueueOnMainThread(Action)"/> posts callbacks that are executed from <c>Update</c>.
/// </summary>
public class Loom : MonoBehaviour
{
    /// <summary>Upper bound on the number of work items started by <see cref="RunAsync"/> that may run at once.</summary>
    public static int maxThreads = 8;

    // Number of RunAsync work items currently queued or running; only modified through Interlocked.
    static int numThreads;

    // Not referenced anywhere in this class.
    private int _count;

    /// <summary>The instance whose Awake ran most recently.</summary>
    public static Loom Current;

    void Awake()
    {
        Current = this;
    }

    // Callbacks waiting for the next Update; guarded by lock (_actions).
    private List<Action> _actions = new List<Action>();

    /// <summary>A callback together with the <c>Time.time</c> value at which it becomes due.</summary>
    public struct DelayedQueueItem
    {
        public float time;
        public Action action;
    }

    // Delayed callbacks that are not due yet; guarded by lock (_delayed).
    private List<DelayedQueueItem> _delayed = new List<DelayedQueueItem>();

    // Scratch lists reused every frame so Update can invoke callbacks outside the locks.
    List<DelayedQueueItem> _currentDelayed = new List<DelayedQueueItem>();
    List<Action> _currentActions = new List<Action>();

    /// <summary>Queues <paramref name="action"/> to run during the next <c>Update</c>.</summary>
    /// <param name="action">Callback to execute from Update.</param>
    public void QueueOnMainThread(Action action)
    {
        QueueOnMainThread(action, 0f);
    }

    /// <summary>
    /// Queues <paramref name="action"/> to run from <c>Update</c>, either on the next frame
    /// (<paramref name="time"/> == 0) or once <paramref name="time"/> seconds of
    /// <c>Time.time</c> have elapsed.
    /// </summary>
    /// <param name="action">Callback to execute from Update.</param>
    /// <param name="time">Delay in seconds; 0 means "next Update".</param>
    public void QueueOnMainThread(Action action, float time)
    {
        if (time != 0)
        {
            // NOTE(review): Time.time is read on the calling thread here. Unity may reject
            // that call off the main thread, so delayed items should presumably only be
            // queued from the main thread - confirm against the Unity version in use.
            float dueTime = Time.time + time;
            lock (Current._delayed)
            {
                Current._delayed.Add(new DelayedQueueItem { time = dueTime, action = action });
            }
        }
        else
        {
            lock (Current._actions)
            {
                Current._actions.Add(action);
            }
        }
    }

    /// <summary>
    /// Runs <paramref name="a"/> on the thread pool. Blocks the caller (sleeping 1 ms at a
    /// time) while <see cref="maxThreads"/> work items are already in flight.
    /// </summary>
    /// <param name="a">Work to execute on a thread-pool thread.</param>
    /// <returns>Always <c>null</c>; the return type is kept for existing callers.</returns>
    public Thread RunAsync(Action a)
    {
        // Reserve a slot atomically. A separate "check, then increment" lets several callers
        // pass the check at the same moment and overshoot maxThreads.
        while (true)
        {
            int running = Interlocked.CompareExchange(ref numThreads, 0, 0);
            if (running < maxThreads &&
                Interlocked.CompareExchange(ref numThreads, running + 1, running) == running)
            {
                break;
            }
            Thread.Sleep(1);
        }

        try
        {
            ThreadPool.QueueUserWorkItem(RunAction, a);
        }
        catch
        {
            // The work item never started, so give the reserved slot back.
            Interlocked.Decrement(ref numThreads);
            throw;
        }
        return null;
    }

    // Thread-pool entry point: runs the work item and always releases its slot.
    private void RunAction(object action)
    {
        try
        {
            ((Action)action)();
        }
        catch (Exception e)
        {
            // Report the failure instead of silently discarding it.
            Debug.LogException(e);
        }
        finally
        {
            Interlocked.Decrement(ref numThreads);
        }
    }

    // Invokes one queued callback; a failure is logged so the remaining callbacks still run.
    private static void SafeInvoke(Action callback)
    {
        try
        {
            callback();
        }
        catch (Exception e)
        {
            Debug.LogException(e);
        }
    }

    void Update()
    {
        // Take a snapshot under the lock and invoke outside it, so callbacks may queue more work.
        lock (_actions)
        {
            _currentActions.Clear();
            _currentActions.AddRange(_actions);
            _actions.Clear();
        }
        foreach (var a in _currentActions)
        {
            SafeInvoke(a);
        }

        float now = Time.time;
        lock (_delayed)
        {
            _currentDelayed.Clear();
            _currentDelayed.AddRange(_delayed.Where(d => d.time <= now));
            // Single pass instead of one List.Remove per due item.
            _delayed.RemoveAll(d => d.time <= now);
        }
        foreach (var delayed in _currentDelayed)
        {
            SafeInvoke(delayed.action);
        }
    }
}
//Loom.Current.QueueOnMainThread(() => {// switch to the main thread
// //todo: main-thread work
//});
//Loom.Current.RunAsync(() => {// switch to a thread-pool thread
// //todo: background work
//});
using SherpaOnnx;
using System.Collections.Generic;
using UnityEngine;
using AOT;
using System.IO;
using System;
using System.Runtime.InteropServices;
/// <summary>
/// Text-to-speech component built on sherpa-onnx's <c>OfflineTts</c> with the
/// vits-melo-tts-zh_en model. The model is loaded on a worker thread via <c>Loom</c>;
/// generated samples arrive through a native callback, are buffered in <c>audioData</c>
/// and are played through a streaming <c>AudioClip</c> on the attached <c>AudioSource</c>.
/// </summary>
[RequireComponent(typeof(AudioSource))]
public class Sherpa_TextToSpeech : MonoBehaviour
{
    /// <summary>The instance whose Awake ran most recently.</summary>
    public static Sherpa_TextToSpeech Instance;

    private OfflineTts ot;
    private OfflineTtsGeneratedAudio otga;
    private OfflineTtsConfig config;
    // Kept in a field so the delegate handed to native code stays referenced.
    private OfflineTtsCallback otc;
    private AudioSource audioSource;
    private AudioClip audioClip = null;
    private int sampleRate = 22050;

    // Samples received so far for the current utterance; guarded by lock (audioData).
    private List<float> audioData = new List<float>();
    // Read position of the streaming clip inside audioData; guarded by lock (audioData).
    private int curAudioClipPos = 0;

    /// <summary>Seconds of audio received so far for the current utterance.</summary>
    public float audioLength = 0f;

    // Written by the worker thread in Init and read on the main thread, hence volatile.
    private volatile bool initDoneFlag = false;

    /// <summary>True once the model has been loaded successfully.</summary>
    public bool initDone
    {
        get { return initDoneFlag; }
        private set { initDoneFlag = value; }
    }

    // Serializes Generate work items: they share the engine and the sample buffer.
    private readonly object generateGate = new object();

    #region PathDefine
    // Resolved in Awake. Application.streamingAssetsPath is read only there because, to my
    // knowledge, Unity rejects it from instance field initializers and from worker threads
    // - NOTE(review): confirm against the Unity version in use.
    private string modelFile;
    private string lexiconFile;
    private string tokensFile;
    private string dictDir;
    private string phone_fst;
    private string date_fst;
    private string number_fst;
    // NOTE(review): SaveToWaveFile presumably writes WAV data, yet the file is named ".mp3".
    // The name is left unchanged in case something else reads this path.
    private string savePath;
    #endregion

    private void Awake()
    {
        Instance = this;

        string modelDir = $"{Application.streamingAssetsPath}/models/vits-melo-tts-zh_en";
        modelFile = $"{modelDir}/model.onnx";
        lexiconFile = $"{modelDir}/lexicon.txt";
        tokensFile = $"{modelDir}/tokens.txt";
        dictDir = $"{modelDir}/dict";
        phone_fst = $"{modelDir}/phone.fst";
        date_fst = $"{modelDir}/date.fst";
        number_fst = $"{modelDir}/number.fst";
        savePath = Application.streamingAssetsPath + "/1.mp3";

        audioSource = GetComponent<AudioSource>();
        audioSource.loop = true;
        audioSource.playOnAwake = false;
        DontDestroyOnLoad(gameObject);
    }

    private void Start()
    {
        initDone = false;
        if (Loom.Current == null)
        {
            Debug.LogError("场景中未找到 Loom 组件,无法初始化文字转语音");
            return;
        }
        Debug.Log("开始初始化模型,请等待...");
        Loom.Current.RunAsync(Init);
    }

    /// <summary>Builds the TTS configuration and loads the model. Runs on a worker thread.</summary>
    private void Init()
    {
        if (!FileCheck()) return;
        try
        {
            config = new OfflineTtsConfig
            {
                Model =
                {
                    Vits =
                    {
                        Model = modelFile,
                        Lexicon = lexiconFile,
                        Tokens = tokensFile,
                        DictDir = dictDir,
                        NoiseScale = 0.667f,
                        NoiseScaleW = 0.8f,
                        LengthScale = 1f
                    },
                    NumThreads = 5,
                    Debug = 1,
                    Provider = "cpu"
                },
                RuleFsts = $"{phone_fst},{date_fst},{number_fst}",
                MaxNumSentences = 1
            };
            ot = new OfflineTts(config);
            sampleRate = ot.SampleRate;
            otc = new OfflineTtsCallback(StaticOnAudioData);
            initDone = true;
            Loom.Current.QueueOnMainThread(() =>
                Debug.Log("文字转语音初始化完成"));
        }
        catch (Exception e)
        {
            Loom.Current.QueueOnMainThread(() =>
                Debug.LogError("初始化文字转语音时发生错误: " + e.Message));
        }
    }

    /// <summary>Checks that the model, lexicon and tokens files and the dict directory exist.</summary>
    /// <returns>True when all of them exist; otherwise logs the first missing one and returns false.</returns>
    private bool FileCheck()
    {
        if (!File.Exists(modelFile))
        {
            Debug.LogError("模型文件不存在: " + modelFile);
            return false;
        }
        if (!File.Exists(lexiconFile))
        {
            Debug.LogError("词典文件不存在: " + lexiconFile);
            return false;
        }
        if (!File.Exists(tokensFile))
        {
            Debug.LogError("tokens文件不存在: " + tokensFile);
            return false;
        }
        if (!Directory.Exists(dictDir))
        {
            Debug.LogError("字典目录不存在: " + dictDir);
            return false;
        }
        return true;
    }

    // Static trampoline for the native callback; forwards to the current instance.
    // Returns 0 when there is no instance.
    [MonoPInvokeCallback(typeof(OfflineTtsCallback))]
    private static int StaticOnAudioData(IntPtr samples, int n)
    {
        return Instance?.OnAudioData(samples, n) ?? 0;
    }

    /// <summary>
    /// Copies <paramref name="n"/> samples from native memory into the buffer and asks the
    /// main thread to start playback once more than two seconds are buffered.
    /// </summary>
    /// <returns><paramref name="n"/>, or 0 when no samples were delivered.</returns>
    private int OnAudioData(IntPtr samples, int n)
    {
        if (n <= 0)
        {
            Loom.Current.QueueOnMainThread(() =>
                Debug.LogWarning("收到空的音频数据"));
            return 0;
        }
        float[] tempData = new float[n];
        Marshal.Copy(samples, tempData, 0, n);
        lock (audioData)
        {
            audioData.AddRange(tempData);
        }
        Loom.Current.QueueOnMainThread(() =>
        {
            Debug.Log($"收到音频数据,长度: {n}");
            audioLength += n / (float)sampleRate;
            Debug.Log($"音频长度增加 {n / (float)sampleRate} 秒");
            TryStartPlayback(sampleRate * 2);
        });
        return n;
    }

    /// <summary>
    /// Starts the streaming clip when the source is idle and more than
    /// <paramref name="minBufferedSamples"/> samples are buffered. Main thread only.
    /// </summary>
    private void TryStartPlayback(int minBufferedSamples)
    {
        if (audioSource.isPlaying) return;

        int buffered;
        lock (audioData)
        {
            // Count and read position are shared with the worker and the audio read callback.
            buffered = audioData.Count;
            if (buffered <= minBufferedSamples) return;
            curAudioClipPos = 0;
        }

        Debug.Log($"开始播放音频,数据长度: {buffered}");
        if (audioClip != null)
        {
            // The source is idle here, so the previous streaming clip can be released.
            Destroy(audioClip);
        }
        audioClip = AudioClip.Create("SynthesizedAudio", sampleRate * 2, 1, sampleRate, true, OnAudioRead);
        audioSource.clip = audioClip;
        audioSource.Play();
    }

    // PCM reader callback passed to AudioClip.Create.
    private void OnAudioRead(float[] data)
    {
        ExtractAudioData(data);
    }

    /// <summary>
    /// Fills <paramref name="data"/> with the next buffered samples and zero-fills whatever
    /// cannot be served.
    /// </summary>
    /// <returns>True when at least one sample was copied.</returns>
    private bool ExtractAudioData(float[] data)
    {
        if (data == null || data.Length == 0) return false;
        bool hasData = false;
        int dataIndex = 0;
        lock (audioData)
        {
            if (audioData.Count > 0 && curAudioClipPos < audioData.Count)
            {
                int copyCount = Mathf.Min(data.Length, audioData.Count - curAudioClipPos);
                audioData.CopyTo(curAudioClipPos, data, 0, copyCount);
                curAudioClipPos += copyCount;
                hasData = copyCount > 0;
                dataIndex = copyCount;
            }
        }
        // Zero-fill the remainder so stale samples are never played.
        if (dataIndex < data.Length)
            Array.Clear(data, dataIndex, data.Length - dataIndex);
        return hasData;
    }

    /// <summary>
    /// Synthesizes <paramref name="text"/> on a worker thread, streams it to the audio
    /// source and saves the result to disk.
    /// </summary>
    /// <param name="text">Text to speak; empty or whitespace-only text is ignored with a warning.</param>
    /// <param name="speed">Speed value passed through to <c>GenerateWithCallback</c>.</param>
    /// <param name="speakerId">Speaker id passed through to <c>GenerateWithCallback</c>.</param>
    public void Generate(string text, float speed, int speakerId)
    {
        if (!initDone)
        {
            Debug.LogWarning("文字转语音未完成初始化");
            return;
        }
        if (string.IsNullOrWhiteSpace(text))
        {
            Debug.LogWarning("待合成的文本为空");
            return;
        }
        if (!File.Exists(modelFile))
        {
            Debug.LogError("模型文件不存在: " + modelFile);
            return;
        }
        Debug.Log($"开始生成语音,文本为:{text}");
        Loom.Current.RunAsync(() =>
        {
            try
            {
                // One generation at a time: overlapping requests would share the engine,
                // the sample buffer and the otga field.
                lock (generateGate)
                {
                    Debug.Log("异步生成语音开始");
                    lock (audioData)
                    {
                        audioData.Clear();
                        curAudioClipPos = 0;
                    }
                    audioLength = 0f;

                    OfflineTtsGeneratedAudio previous = otga;
                    otga = ot.GenerateWithCallback(text, speed, speakerId, otc);
                    // Release the previous result instead of dropping the reference.
                    previous?.Dispose();

                    // An utterance of two seconds or less never crosses the streaming
                    // threshold, so start playback with whatever has been buffered.
                    Loom.Current.QueueOnMainThread(() => TryStartPlayback(0));

                    if (otga.SaveToWaveFile(savePath))
                    {
                        Debug.Log("异步生成语音结束,保存完成");
                    }
                }
            }
            catch (Exception e)
            {
                Loom.Current.QueueOnMainThread(() =>
                    Debug.LogError("生成语音时发生错误: " + e.Message));
            }
        });
    }

    private void OnApplicationQuit()
    {
        ot?.Dispose();
        otga?.Dispose();
        otc = null;
    }
}
using System.Collections;
using System.Collections.Generic;
using UnityEngine;
using UnityEngine.UI;
/// <summary>
/// UI panel that sends the text shown in <see cref="txt_Des"/> to
/// <c>Sherpa_TextToSpeech</c> when <see cref="btn_Generate"/> is clicked.
/// </summary>
public class TTS_Panel : MonoBehaviour
{
    /// <summary>Text element whose content is synthesized.</summary>
    public Text txt_Des;

    /// <summary>Button that triggers synthesis.</summary>
    public Button btn_Generate;

    void Start()
    {
        if (btn_Generate == null || txt_Des == null)
        {
            Debug.LogError("TTS_Panel 的 txt_Des 或 btn_Generate 未赋值");
            return;
        }
        btn_Generate.onClick.AddListener(OnGenerateClicked);
    }

    void OnDestroy()
    {
        // Remove the listener so the button does not keep calling into a destroyed panel.
        if (btn_Generate != null)
        {
            btn_Generate.onClick.RemoveListener(OnGenerateClicked);
        }
    }

    // Click handler: speaks the current text at speed 1.0 with speaker id 0.
    private void OnGenerateClicked()
    {
        string text = txt_Des.text;
        // IsNullOrWhiteSpace already covers null and empty strings.
        if (string.IsNullOrWhiteSpace(text)) return;

        Sherpa_TextToSpeech tts = Sherpa_TextToSpeech.Instance;
        if (tts == null)
        {
            Debug.LogWarning("Sherpa_TextToSpeech 实例不存在,无法生成语音");
            return;
        }
        tts.Generate(text, 1.0f, 0);
    }
}
1412

被折叠的 条评论
为什么被折叠?



