using System.Collections.Generic;
using UnityEngine;
using Unity.Sentis;
using System.IO;

//                Jets Text-To-Speech Inference
//                =============================
//
// This file implements the Jets text-to-speech model in Unity Sentis.
// The model uses phonemes instead of raw text, so the text has to be converted first.
//
// Place this file on the Main Camera
// Add an audio source
// Change the inputText
// When running you can press the space bar to play it again

public class RunJets : MonoBehaviour
{
    public string inputText = "Once upon a time, there lived a girl called Alice. She lived in a house in the woods.";
    //string inputText = "The quick brown fox jumped over the lazy dog";
    //string inputText = "Hello, my name is Ginger the Giraffe!";
    //string inputText = "There are many uses of the things she uses!";

    // Set to true if we have put the phoneme_dict.txt in the Assets/StreamingAssets folder.
    // ReadDictionary() clears this flag when the file cannot be found so that the
    // built-in example phoneme string is used instead.
    bool hasPhonemeDictionary = true;

    // Phoneme vocabulary: the index of a symbol in this array is the token id fed to the model
    // (see GetTokens). The first two entries and the last one are empty strings in this file;
    // NOTE(review): presumably placeholders for special tokens - confirm against the model's vocabulary.
    readonly string[] phonemes = new string[] {
        "", "", "AH0", "N", "T", "D", "S", "R", "L", "DH", "K", "Z", "IH1",
        "IH0", "M", "EH1", "W", "P", "AE1", "AH1", "V", "ER0", "F", "','", "AA1", "B",
        "HH", "IY1", "UW1", "IY0", "AO1", "EY1", "AY1", ".", "OW1", "SH", "NG", "G",
        "ER1", "CH", "JH", "Y", "AW1", "TH", "UH1", "EH2", "OW0", "EY2", "AO0", "IH2",
        "AE2", "AY2", "AA2", "UW0", "EH0", "OY1", "EY0", "AO2", "ZH", "OW2", "AE0", "UW2",
        "AH2", "AY0", "IY2", "AW2", "AA0", "''''", "ER2", "UH2", "'?'", "OY2", "'!'", "AW0",
        "UH0", "OY0", "..", "" };

    // One phoneme per letter A..Z. Not referenced anywhere else in this class.
    readonly string[] alphabet = "AE1 B K D EH1 F G HH IH1 JH K L M N AA1 P K R S T AH1 V W K Y Z".Split(' ');

    //Can change pitch and speed with this for a slightly different voice:
    const int samplerate = 22050;

    // Maps an upper-case word (or punctuation mark) to its space-separated phoneme string.
    Dictionary<string, string> dict = new();

    IWorker engine;

    AudioClip clip;

    void Start()
    {
        LoadModel();
        ReadDictionary();
        TextToSpeech();
    }

    /// <summary>Loads the Jets model from StreamingAssets and creates the inference worker.</summary>
    void LoadModel()
    {
        var model = ModelLoader.Load(Application.streamingAssetsPath + "/jets-text-to-speech.sentis");
        engine = WorkerFactory.CreateWorker(BackendType.GPUCompute, model);
    }

    /// <summary>
    /// Converts <see cref="inputText"/> to phonemes (or uses a built-in example when no
    /// dictionary is available), runs the model and plays the result.
    /// </summary>
    void TextToSpeech()
    {
        string ptext;
        if (hasPhonemeDictionary)
        {
            ptext = TextToPhonemes(inputText);
            Debug.Log(ptext);
        }
        else
        {
            //If we have no phoneme dictionary we can use one of these examples:
            ptext = "DH AH0 K W IH1 K B R AW1 N F AA1 K S JH AH1 M P S OW1 V ER0 DH AH0 L EY1 Z IY0 D AO1 G .";
            //ptext = "W AH1 N S AH0 P AA1 N AH0 T AY1 M , AH0 F R AA1 G M EH1 T AH0 P R IH1 N S EH0 S . DH AH0 F R AA1 G K IH1 S T DH AH0 P R IH1 N S EH0 S AH0 N D B IH0 K EY1 M AH0 P R IH1 N S .";
            //ptext = "D UW1 P L AH0 K EY2 T";
        }
        DoInference(ptext);
    }

    /// <summary>
    /// Reads phoneme_dict.txt from StreamingAssets into <see cref="dict"/> and adds the
    /// punctuation codes. Lines whose first token is ";;;" are skipped, as are blank lines
    /// and lines that have a word but no phonemes. If the file is missing, a warning is
    /// logged and the built-in example phonemes are used instead.
    /// </summary>
    void ReadDictionary()
    {
        if (!hasPhonemeDictionary)
        {
            return;
        }

        string path = Application.streamingAssetsPath + "/phoneme_dict.txt";
        if (!File.Exists(path))
        {
            Debug.LogWarning("Phoneme dictionary not found at " + path + "; using the built-in example phonemes.");
            hasPhonemeDictionary = false;
            return;
        }

        string[] lines = File.ReadAllLines(path);
        for (int i = 0; i < lines.Length; i++)
        {
            string[] parts = lines[i].Split(' ', System.StringSplitOptions.RemoveEmptyEntries);

            // Need at least "WORD PHONEME"; this also rejects blank lines.
            if (parts.Length < 2 || parts[0] == ";;;")
            {
                continue;
            }

            // Re-joining the tokens tolerates any amount of whitespace between the word and
            // its phonemes. The indexer (rather than Add) avoids throwing on a repeated word.
            dict[parts[0]] = string.Join(" ", parts, 1, parts.Length - 1);
        }

        // Add codes for punctuation to the dictionary
        dict[","] = "','";
        dict["."] = ".";
        dict["!"] = "'!'";
        dict["?"] = "'?'";
        dict["\""] = "''''";
    }

    /// <summary>Replaces every decimal digit in <paramref name="text"/> with its English name surrounded by spaces.</summary>
    /// <param name="text">Text that may contain the digits 0-9.</param>
    /// <returns>The text with each digit spelled out, e.g. "42" becomes " FOUR  TWO ".</returns>
    public string ExpandNumbers(string text)
    {
        return text
            .Replace("0", " ZERO ")
            .Replace("1", " ONE ")
            .Replace("2", " TWO ")
            .Replace("3", " THREE ")
            .Replace("4", " FOUR ")
            .Replace("5", " FIVE ")
            .Replace("6", " SIX ")
            .Replace("7", " SEVEN ")
            .Replace("8", " EIGHT ")
            .Replace("9", " NINE ");
    }

    /// <summary>Converts plain text into a space-separated phoneme string using the dictionary.</summary>
    /// <param name="text">The text to convert; null or empty yields an empty string.</param>
    /// <returns>The phonemes of every whitespace-separated word, each followed by a space.</returns>
    public string TextToPhonemes(string text)
    {
        if (string.IsNullOrEmpty(text))
        {
            return "";
        }

        // Dictionary keys are upper-case; the invariant culture keeps the mapping stable
        // regardless of the user's locale (e.g. Turkish 'i').
        string[] words = ExpandNumbers(text).ToUpperInvariant().Split();

        var output = new System.Text.StringBuilder();
        for (int i = 0; i < words.Length; i++)
        {
            output.Append(DecodeWord(words[i]));
        }
        return output.ToString();
    }

    //Decode the word into phonemes by looking for the longest word in the dictionary that matches
    //the first part of the word and so on.
    //This works fairly well but could be improved. The original paper had a model that
    //dealt with guessing the phonemes of words
    /// <summary>Greedy longest-prefix decoding of one word into phonemes.</summary>
    /// <param name="word">An upper-case word, possibly with attached punctuation.</param>
    /// <returns>
    /// The phoneme strings of the matched pieces, each followed by a space. Characters that
    /// do not start any dictionary entry are skipped. Null or empty input yields "".
    /// </returns>
    public string DecodeWord(string word)
    {
        if (string.IsNullOrEmpty(word))
        {
            return "";
        }

        var output = new System.Text.StringBuilder();
        int start = 0;
        while (start < word.Length)
        {
            bool matched = false;

            // Try the longest remaining piece first, then shrink it one character at a time.
            // "end > start" keeps the length positive, so Substring can never be asked for a
            // negative length.
            for (int end = word.Length; end > start; end--)
            {
                if (dict.TryGetValue(word.Substring(start, end - start), out string value))
                {
                    output.Append(value).Append(' ');
                    start = end;
                    matched = true;
                    break;
                }
            }

            if (!matched)
            {
                // Nothing in the dictionary begins here: drop this character and carry on
                // so that the rest of the word is still decoded.
                start++;
            }
        }
        return output.ToString();
    }

    /// <summary>
    /// Maps a whitespace-separated phoneme string to token ids (indices into
    /// <see cref="phonemes"/>). Symbols that are not in the table map to 0.
    /// </summary>
    int[] GetTokens(string ptext)
    {
        string[] p = ptext.Split();
        var tokens = new int[p.Length];
        for (int i = 0; i < tokens.Length; i++)
        {
            // IndexOf returns -1 for unknown symbols; clamp that to token 0.
            tokens[i] = Mathf.Max(0, System.Array.IndexOf(phonemes, p[i]));
        }
        return tokens;
    }

    /// <summary>Runs the model on a phoneme string, stores the audio in <see cref="clip"/> and plays it.</summary>
    /// <param name="ptext">Space-separated phonemes, e.g. "D UW1 P L AH0 K EY2 T".</param>
    public void DoInference(string ptext)
    {
        int[] tokens = GetTokens(ptext);

        using var input = new TensorInt(new TensorShape(tokens.Length), tokens);
        var result = engine.Execute(input);

        var output = result.PeekOutput("wav") as TensorFloat;
        if (output == null)
        {
            Debug.LogError("The model did not produce a float output named \"wav\"");
            return;
        }
        output.MakeReadable();
        var samples = output.ToReadOnlyArray();

        // Floating-point division so that clips shorter than a second do not report 0.
        Debug.Log($"Audio size = {(float)samples.Length / samplerate:F2} seconds");

        // Release the clip from the previous run before replacing it; clips created at
        // runtime are not released automatically when the reference is overwritten.
        if (clip != null)
        {
            Destroy(clip);
        }

        clip = AudioClip.Create("voice audio", samples.Length, 1, samplerate, false);
        clip.SetData(samples, 0);

        Speak();
    }

    /// <summary>Plays <see cref="clip"/> on the AudioSource attached to this GameObject, if there is one.</summary>
    private void Speak()
    {
        AudioSource audioSource = GetComponent<AudioSource>();
        if (audioSource != null)
        {
            audioSource.clip = clip;
            audioSource.Play();
        }
        else
        {
            Debug.Log("There is no audio source");
        }
    }

    void Update()
    {
        // Space bar re-runs the whole text-to-speech pipeline and plays the result.
        if (Input.GetKeyDown(KeyCode.Space))
        {
            TextToSpeech();
        }
    }

    private void OnDestroy()
    {
        engine?.Dispose();

        if (clip != null)
        {
            Destroy(clip);
        }
    }
}