carlosdanielhernandezmena
committed on
Commit
•
1ea6594
1
Parent(s):
478b2d8
Changing the variable sentence
Browse files
README.md
CHANGED
@@ -148,7 +148,7 @@ ds=load_dataset("ciempiess/ciempiess_test", split="test")
|
|
148 |
import re
|
149 |
chars_to_ignore_regex = '[\\,\\?\\.\\!\\\;\\:\\"\\“\\%\\‘\\”\\�\\)\\(\\*)]'
|
150 |
def remove_special_characters(batch):
|
151 |
-
batch["sentence"] = re.sub(chars_to_ignore_regex, '', batch["sentence"]).lower()
|
152 |
return batch
|
153 |
ds = ds.map(remove_special_characters)
|
154 |
#Downsample to 16kHz
|
@@ -159,7 +159,7 @@ def prepare_dataset(batch):
|
|
159 |
#Batched output is "un-batched" to ensure mapping is correct
|
160 |
batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
|
161 |
with processor.as_target_processor():
|
162 |
-
batch["labels"] = processor(batch["sentence"]).input_ids
|
163 |
return batch
|
164 |
ds = ds.map(prepare_dataset, remove_columns=ds.column_names,num_proc=1)
|
165 |
#Define the evaluation metric
|
@@ -182,11 +182,11 @@ def map_to_result(batch):
|
|
182 |
logits = model(input_values).logits
|
183 |
pred_ids = torch.argmax(logits, dim=-1)
|
184 |
batch["pred_str"] = processor.batch_decode(pred_ids)[0]
|
185 |
-
batch["sentence"] = processor.decode(batch["labels"], group_tokens=False)
|
186 |
return batch
|
187 |
results = ds.map(map_to_result,remove_columns=ds.column_names)
|
188 |
#Compute the overall WER now.
|
189 |
-
print("Test WER: {:.3f}".format(wer_metric.compute(predictions=results["pred_str"], references=results["sentence"])))
|
190 |
|
191 |
```
|
192 |
**Test Result**: 0.112
|
|
|
148 |
import re
|
149 |
chars_to_ignore_regex = '[\\,\\?\\.\\!\\\;\\:\\"\\“\\%\\‘\\”\\�\\)\\(\\*)]'
|
150 |
def remove_special_characters(batch):
|
151 |
+
batch["normalized_text"] = re.sub(chars_to_ignore_regex, '', batch["normalized_text"]).lower()
|
152 |
return batch
|
153 |
ds = ds.map(remove_special_characters)
|
154 |
#Downsample to 16kHz
|
|
|
159 |
#Batched output is "un-batched" to ensure mapping is correct
|
160 |
batch["input_values"] = processor(audio["array"], sampling_rate=audio["sampling_rate"]).input_values[0]
|
161 |
with processor.as_target_processor():
|
162 |
+
batch["labels"] = processor(batch["normalized_text"]).input_ids
|
163 |
return batch
|
164 |
ds = ds.map(prepare_dataset, remove_columns=ds.column_names,num_proc=1)
|
165 |
#Define the evaluation metric
|
|
|
182 |
logits = model(input_values).logits
|
183 |
pred_ids = torch.argmax(logits, dim=-1)
|
184 |
batch["pred_str"] = processor.batch_decode(pred_ids)[0]
|
185 |
+
batch["normalized_text"] = processor.decode(batch["labels"], group_tokens=False)
|
186 |
return batch
|
187 |
results = ds.map(map_to_result,remove_columns=ds.column_names)
|
188 |
#Compute the overall WER now.
|
189 |
+
print("Test WER: {:.3f}".format(wer_metric.compute(predictions=results["pred_str"], references=results["normalized_text"])))
|
190 |
|
191 |
```
|
192 |
**Test Result**: 0.112
|