Spaces:

JavierGon12
/

retrAIced

Running

App Files Files Community

retrAIced / pages /Speech Recognition.py

JavierGon12

Remove unnecessary libraries and clean code a bit

cd03817 10 months ago

raw

history blame contribute delete

No virus

6.52 kB

	import streamlit as st
	import torch
	import torchaudio
	from transformers import pipeline
	from streamlit_mic_recorder import mic_recorder,speech_to_text
	import numpy as np


	# --- Audio source selection -------------------------------------------------
	# After this section `audio` holds one of:
	#   * None                      - nothing recorded / uploaded yet
	#   * the recording's "bytes"   - when the microphone was used
	#   * the uploader's return     - when a file was uploaded
	option = st.selectbox(
	    "How do you want to import the audio file?",
	    ("Microphone", "Upload file"),
	)

	if option == "Microphone":
	    # Record from the microphone widget and play the result back.
	    st.write("Record your voice, and play the recorded audio:")
	    recording = mic_recorder(
	        start_prompt="Press the button to start recording ⏺️",
	        stop_prompt="Press the button to stop the recording ⏹️",
	        key='recorder',
	    )

	    if recording is None:
	        st.write("Please start the recording in the box above")
	        audio = None
	    else:
	        # Keep only the encoded audio payload; that is what gets transcribed.
	        audio = recording["bytes"]
	        st.audio(audio)

	elif option == "Upload file":
	    audio = st.file_uploader(label="Upload your audio file here", type=["wav", 'mp3'])
	    if audio:
	        st.audio(audio)

	# Language of the speech; used to pick the transcription model.
	option_language = st.selectbox(
	    'Select the language of your audio',
	    ('English', 'Spanish', 'German', 'French', 'Chinese'),
	)


	# Hugging Face checkpoint and result caption for each selectable language.
	LANGUAGE_SETTINGS = {
	    "English": ("openai/whisper-large-v2", "Here is your transcription:"),
	    "Spanish": ("Sandiago21/whisper-large-v2-spanish", "Aqui tienes tu transcripcion:"),
	    "German": ("primeline/whisper-large-v3-german", "Hier ist Ihre Transkription:"),
	    "French": ("bofenghuang/whisper-large-v2-french", "Ici, vous avez votre transcription"),
	    "Chinese": ("yi-ching/whisper-tiny-chinese-test", "这是您的转录。"),
	}

	# The waveform is resampled to this rate before it is handed to the model.
	TARGET_SAMPLE_RATE = 16000


	@st.cache_resource(max_entries=1)
	def load_asr_pipeline(model_name):
	    """Build the speech-recognition pipeline for ``model_name``.

	    The result is cached so that Streamlit reruns do not rebuild the model
	    each time; only the most recently used model is kept (``max_entries=1``)
	    so switching languages does not accumulate several models in memory.
	    """
	    return pipeline("automatic-speech-recognition", model=model_name)


	def transcribe_audio(audio_file, model_name="openai/whisper-large-v2"):
	    """Transcribe ``audio_file`` with the given speech-recognition model.

	    Args:
	        audio_file: Raw encoded audio bytes (microphone recording) or
	            anything ``torchaudio.load`` accepts (path or file-like object).
	        model_name: Hugging Face model id passed to the ASR pipeline.

	    Returns:
	        Whatever the ASR pipeline returns for the prepared waveform.
	    """
	    import io

	    # The microphone path delivers plain bytes, which torchaudio cannot load
	    # directly; wrap them so they can be read as a file-like object.
	    if isinstance(audio_file, (bytes, bytearray)):
	        audio_file = io.BytesIO(audio_file)
	    elif hasattr(audio_file, "seek"):
	        # Rewind so decoding starts at the first byte even if the stream was
	        # already read earlier.
	        audio_file.seek(0)

	    # Load the audio file
	    waveform, sample_rate = torchaudio.load(audio_file)

	    # Ensure mono-channel audio by averaging the channels
	    if waveform.shape[0] > 1:
	        waveform = torch.mean(waveform, dim=0, keepdim=True)

	    # Convert to a 16kHz sample rate if not already
	    if sample_rate != TARGET_SAMPLE_RATE:
	        resampler = torchaudio.transforms.Resample(sample_rate, TARGET_SAMPLE_RATE)
	        waveform = resampler(waveform)

	    # Use Hugging Face's ASR pipeline on the single remaining channel
	    asr_pipeline = load_asr_pipeline(model_name)
	    return asr_pipeline(waveform.numpy()[0])


	if audio is None:
	    st.write("Please upload the audio in the box above")
	else:
	    # One lookup replaces the per-language copies of the same code.
	    model_name, caption = LANGUAGE_SETTINGS[option_language]
	    transcription = transcribe_audio(audio, model_name)
	    st.write(caption)
	    st.write(transcription)