Paul Bird committed on
Commit
fd92562
1 Parent(s): 4d10ff7

Upload RunWhisper.cs

Browse files
Files changed (1) hide show
  1. RunWhisper.cs +18 -12
RunWhisper.cs CHANGED
@@ -36,6 +36,7 @@ public class RunWhisper : MonoBehaviour
36
  // Link your audioclip here. Format must be 16kHz mono non-compressed.
37
  public AudioClip audioClip;
38
 
 
39
  const int maxTokens = 100;
40
 
41
  //Special tokens
@@ -56,19 +57,22 @@ public class RunWhisper : MonoBehaviour
56
  int[] outputTokens = new int[maxTokens];
57
 
58
  // Used for special character decoding
59
- int[] shiftDownDict = new int[256];
60
 
61
  TensorFloat encodedAudio;
62
 
63
  bool transcribe = false;
64
  string outputString = "";
65
 
 
 
 
66
  void Start()
67
  {
68
  allocator = new TensorCachingAllocator();
69
  ops = WorkerFactory.CreateOps(backend, allocator);
70
 
71
- SetupCharacterShifts();
72
 
73
  GetTokens();
74
 
@@ -117,9 +121,7 @@ public class RunWhisper : MonoBehaviour
117
 
118
  void EncodeAudio()
119
  {
120
- var input = new TensorFloat(new TensorShape(1, numSamples), data);
121
-
122
- int maxSamples = 30 * 16000;
123
  if (numSamples > maxSamples)
124
  {
125
  Debug.Log("The AudioClip is too long.");
@@ -127,7 +129,7 @@ public class RunWhisper : MonoBehaviour
127
  }
128
 
129
  // Pad out to 30 seconds at 16khz if necessary
130
- var input30seconds = ops.Pad(input, new int[] { 0, 0, 0, 30 * 16000 - numSamples });
131
 
132
  spectroEngine.Execute(input30seconds);
133
  var spectroOutput = spectroEngine.PeekOutput() as TensorFloat;
@@ -142,7 +144,7 @@ public class RunWhisper : MonoBehaviour
142
  {
143
  if (transcribe && currentToken < outputTokens.Length - 1)
144
  {
145
- var tokensSoFar = new TensorInt(new TensorShape(1, outputTokens.Length), outputTokens);
146
 
147
  var inputs = new Dictionary<string, Tensor>
148
  {
@@ -153,7 +155,7 @@ public class RunWhisper : MonoBehaviour
153
  decoderEngine.Execute(inputs);
154
  var tokensOut = decoderEngine.PeekOutput() as TensorFloat;
155
 
156
- var tokensPredictions = ops.ArgMax(tokensOut, 2, false);
157
  tokensPredictions.MakeReadable();
158
 
159
  int ID = tokensPredictions[currentToken];
@@ -165,7 +167,10 @@ public class RunWhisper : MonoBehaviour
165
  {
166
  transcribe = false;
167
  }
168
- else if (ID >= tokens.Length) outputString += $"(time={(ID - START_TIME) * 0.02f})";
 
 
 
169
  else outputString += GetUnicodeText(tokens[ID]);
170
 
171
  Debug.Log(outputString);
@@ -185,16 +190,16 @@ public class RunWhisper : MonoBehaviour
185
  foreach (char letter in text)
186
  {
187
  outText += ((int)letter <= 256) ? letter :
188
- (char)shiftDownDict[(int)(letter - 256)];
189
  }
190
  return outText;
191
  }
192
 
193
- void SetupCharacterShifts()
194
  {
195
  for (int i = 0, n = 0; i < 256; i++)
196
  {
197
- if (IsWhiteSpace((char)i)) shiftDownDict[n++] = i;
198
  }
199
  }
200
 
@@ -209,5 +214,6 @@ public class RunWhisper : MonoBehaviour
209
  encoderEngine?.Dispose();
210
  spectroEngine?.Dispose();
211
  ops?.Dispose();
 
212
  }
213
  }
 
36
  // Link your audioclip here. Format must be 16kHz mono non-compressed.
37
  public AudioClip audioClip;
38
 
39
+ // This is how many tokens you want. It can be adjusted.
40
  const int maxTokens = 100;
41
 
42
  //Special tokens
 
57
  int[] outputTokens = new int[maxTokens];
58
 
59
  // Used for special character decoding
60
+ int[] whiteSpaceCharacters = new int[256];
61
 
62
  TensorFloat encodedAudio;
63
 
64
  bool transcribe = false;
65
  string outputString = "";
66
 
67
+ // Maximum size of audioClip (30s at 16kHz)
68
+ const int maxSamples = 30 * 16000;
69
+
70
  void Start()
71
  {
72
  allocator = new TensorCachingAllocator();
73
  ops = WorkerFactory.CreateOps(backend, allocator);
74
 
75
+ SetupWhiteSpaceShifts();
76
 
77
  GetTokens();
78
 
 
121
 
122
  void EncodeAudio()
123
  {
124
+ using var input = new TensorFloat(new TensorShape(1, numSamples), data);
 
 
125
  if (numSamples > maxSamples)
126
  {
127
  Debug.Log("The AudioClip is too long.");
 
129
  }
130
 
131
  // Pad out to 30 seconds at 16khz if necessary
132
+ using var input30seconds = ops.Pad(input, new int[] { 0, 0, 0, maxSamples - numSamples });
133
 
134
  spectroEngine.Execute(input30seconds);
135
  var spectroOutput = spectroEngine.PeekOutput() as TensorFloat;
 
144
  {
145
  if (transcribe && currentToken < outputTokens.Length - 1)
146
  {
147
+ using var tokensSoFar = new TensorInt(new TensorShape(1, outputTokens.Length), outputTokens);
148
 
149
  var inputs = new Dictionary<string, Tensor>
150
  {
 
155
  decoderEngine.Execute(inputs);
156
  var tokensOut = decoderEngine.PeekOutput() as TensorFloat;
157
 
158
+ using var tokensPredictions = ops.ArgMax(tokensOut, 2, false);
159
  tokensPredictions.MakeReadable();
160
 
161
  int ID = tokensPredictions[currentToken];
 
167
  {
168
  transcribe = false;
169
  }
170
+ else if (ID >= tokens.Length)
171
+ {
172
+ outputString += $"(time={(ID - START_TIME) * 0.02f})";
173
+ }
174
  else outputString += GetUnicodeText(tokens[ID]);
175
 
176
  Debug.Log(outputString);
 
190
  foreach (char letter in text)
191
  {
192
  outText += ((int)letter <= 256) ? letter :
193
+ (char)whiteSpaceCharacters[(int)(letter - 256)];
194
  }
195
  return outText;
196
  }
197
 
198
+ void SetupWhiteSpaceShifts()
199
  {
200
  for (int i = 0, n = 0; i < 256; i++)
201
  {
202
+ if (IsWhiteSpace((char)i)) whiteSpaceCharacters[n++] = i;
203
  }
204
  }
205
 
 
214
  encoderEngine?.Dispose();
215
  spectroEngine?.Dispose();
216
  ops?.Dispose();
217
+ allocator?.Dispose();
218
  }
219
  }