Upload 6 files

Browse files

Files changed (6) hide show

AudioDecoder_Tiny.sentis +2 -2
AudioEncoder_Tiny.sentis +2 -2
LogMelSepctro.sentis +2 -2
README.md +3 -3
RunWhisper.cs +15 -19
info.json +1 -1

AudioDecoder_Tiny.sentis CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f6d24553eda46f335ead8ba30e3970fc8056086a538047248821aa31a135f938
-size 198832845

 version https://git-lfs.github.com/spec/v1
+oid sha256:e213397b356d02117ba9489a717c9ff1402175c55ab8882800affa595079768a
+size 198748952

AudioEncoder_Tiny.sentis CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:d3fb532b04b438079db8de9551a0d813da22be5fd05cdeeff3d09794492ca5b1
-size 32888514

 version https://git-lfs.github.com/spec/v1
+oid sha256:e7da4d76dcbd84659f22e744a89ef7916a75a873415fac953459384ee7d4b457
+size 32860344

LogMelSepctro.sentis CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e021007141fdf2d39113ea1aa12bc258226ea1c2976171544f3a05979e2b69ef
-size 1360848

 version https://git-lfs.github.com/spec/v1
+oid sha256:4f1d5bf692d1e8bfb225b493386614f16f1a7c71bc68a5d3106b79793640a8ab
+size 1353668

README.md CHANGED Viewed

@@ -4,14 +4,14 @@ library_name: unity-sentis
 pipeline_tag: automatic-speech-recognition
 ---
-# Whisper-Tiny model in Unity Sentis (Version 1.3.0-pre.3*)
-*Version 1.3.0 Sentis files are not compatible with 1.4.0 and above and need to be recreated
 This is the [Whisper Tiny](https://huggingface.co/openai/whisper-tiny) model tested to work in Unity 2023. It is a speech-to-text model. You feed in a 16kHz wav file and it outputs the best guess for what was said in the audio.
 ## How to Use
 * Open a new scene in Unity 2023
-* Import package ``com.unity.sentis`` version `1.3.0-pre.3` from the package manager.
 * Put the `RunWhisper.cs` on the Main Camera
 * Put the *.sentis files and the `vocab.json` in the Assets/StreamingAssets folder
 * Add a 16kHz mono audio file (up to 30 seconds long) to your project and drag it onto the audioClip field.

 pipeline_tag: automatic-speech-recognition
 ---
+# Whisper-Tiny model in Unity Sentis (Version 1.4.0-pre.2*)
+(*Sentis files created with version 1.3.0 or earlier are not compatible with this version and need to be recreated.)
 This is the [Whisper Tiny](https://huggingface.co/openai/whisper-tiny) model tested to work in Unity 2023. It is a speech-to-text model. You feed in a 16kHz wav file and it outputs the best guess for what was said in the audio.
 ## How to Use
 * Open a new scene in Unity 2023
+* Import package ``com.unity.sentis`` version `1.4.0-pre.2` from the package manager.
 * Put the `RunWhisper.cs` on the Main Camera
 * Put the *.sentis files and the `vocab.json` in the Assets/StreamingAssets folder
 * Add a 16kHz mono audio file (up to 30 seconds long) to your project and drag it onto the audioClip field.

RunWhisper.cs CHANGED Viewed

@@ -49,10 +49,6 @@ public class RunWhisper : MonoBehaviour
     const int TRANSLATE = 50358;  //for speech-to-text then translate to English
     const int NO_TIME_STAMPS = 50363;
     const int START_TIME = 50364;
-    Ops ops;
-    ITensorAllocator allocator;
     int numSamples;
     float[] data;
@@ -74,18 +70,22 @@ public class RunWhisper : MonoBehaviour
     void Start()
     {
-        allocator = new TensorCachingAllocator();
-        ops = WorkerFactory.CreateOps(backend, allocator);
         SetupWhiteSpaceShifts();
         GetTokens();
         Model decoder = ModelLoader.Load(Application.streamingAssetsPath + "/AudioDecoder_Tiny.sentis");
         Model encoder = ModelLoader.Load(Application.streamingAssetsPath + "/AudioEncoder_Tiny.sentis");
         Model spectro = ModelLoader.Load(Application.streamingAssetsPath + "/LogMelSepctro.sentis");
-        decoderEngine = WorkerFactory.CreateWorker(backend, decoder);
         encoderEngine = WorkerFactory.CreateWorker(backend, encoder);
         spectroEngine = WorkerFactory.CreateWorker(backend, spectro);
@@ -116,7 +116,9 @@ public class RunWhisper : MonoBehaviour
             return;
         }
-        data = new float[numSamples];
         audioClip.GetData(data, 0);
     }
@@ -136,10 +138,7 @@ public class RunWhisper : MonoBehaviour
     {
         using var input = new TensorFloat(new TensorShape(1, numSamples), data);
-        // Pad out to 30 seconds at 16khz if necessary
-        using var input30seconds = ops.Pad(input, new int[] { 0, 0, 0, maxSamples - numSamples });
-        spectroEngine.Execute(input30seconds);
         var spectroOutput = spectroEngine.PeekOutput() as TensorFloat;
         encoderEngine.Execute(spectroOutput);
@@ -156,15 +155,14 @@ public class RunWhisper : MonoBehaviour
             var inputs = new Dictionary<string, Tensor>
             {
-                {"encoded_audio",encodedAudio },
-                {"tokens" , tokensSoFar }
             };
             decoderEngine.Execute(inputs);
-            var tokensOut = decoderEngine.PeekOutput() as TensorFloat;
-            using var tokensPredictions = ops.ArgMax(tokensOut, 2, false);
-            tokensPredictions.MakeReadable();
             int ID = tokensPredictions[currentToken];
@@ -225,7 +223,5 @@ public class RunWhisper : MonoBehaviour
         decoderEngine?.Dispose();
         encoderEngine?.Dispose();
         spectroEngine?.Dispose();
-        ops?.Dispose();
-        allocator?.Dispose();
     }
 }

     const int TRANSLATE = 50358;  //for speech-to-text then translate to English
     const int NO_TIME_STAMPS = 50363;
     const int START_TIME = 50364;
     int numSamples;
     float[] data;
     void Start()
     {
         SetupWhiteSpaceShifts();
         GetTokens();
         Model decoder = ModelLoader.Load(Application.streamingAssetsPath + "/AudioDecoder_Tiny.sentis");
+        Model decoderWithArgMax = Functional.Compile(
+            (tokens, audio) => Functional.ArgMax(decoder.Forward(tokens, audio)[0], 2),
+            (decoder.inputs[0], decoder.inputs[1])
+        );
         Model encoder = ModelLoader.Load(Application.streamingAssetsPath + "/AudioEncoder_Tiny.sentis");
         Model spectro = ModelLoader.Load(Application.streamingAssetsPath + "/LogMelSepctro.sentis");
+        decoderEngine = WorkerFactory.CreateWorker(backend, decoderWithArgMax);
         encoderEngine = WorkerFactory.CreateWorker(backend, encoder);
         spectroEngine = WorkerFactory.CreateWorker(backend, spectro);
             return;
         }
+        data = new float[maxSamples];
+        numSamples = maxSamples;
+        //We will get a warning here if data.length is larger than audio length but that is OK
         audioClip.GetData(data, 0);
     }
     {
         using var input = new TensorFloat(new TensorShape(1, numSamples), data);
+        spectroEngine.Execute(input);
         var spectroOutput = spectroEngine.PeekOutput() as TensorFloat;
         encoderEngine.Execute(spectroOutput);
             var inputs = new Dictionary<string, Tensor>
             {
+                {"input_0", tokensSoFar },
+                {"input_1", encodedAudio }
             };
             decoderEngine.Execute(inputs);
+            var tokensPredictions = decoderEngine.PeekOutput() as TensorInt;
+            tokensPredictions.CompleteOperationsAndDownload();
             int ID = tokensPredictions[currentToken];
         decoderEngine?.Dispose();
         encoderEngine?.Dispose();
         spectroEngine?.Dispose();
     }
 }

info.json CHANGED Viewed

@@ -11,6 +11,6 @@
         "vocab.json"
     ],
     "version" : [
-         "1.3.0-pre.3"
     ]
 }

         "vocab.json"
     ],
     "version" : [
+         "1.4.0"
     ]
 }