import streamlit as st
import torch
import torchaudio
from transformers import BartForConditionalGeneration, BartTokenizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from transformers import pipeline

# Load your own audio file
audio = st.file_uploader(label="Upload your audio file here", type=["wav", "mp3"])

option_language = st.selectbox(
    "Select the language of your audio",
    ("English", "Spanish", "German", "French", "Chinese"),
)

if audio is None:
    st.write("Please upload the audio in the box above")
else:
    if option_language == "English":

        def transcribe_audio(audio_file):
            # Load the audio file
            waveform, sample_rate = torchaudio.load(audio_file)

            # Ensure mono-channel audio by averaging the channels
            if waveform.shape[0] > 1:
                waveform = torch.mean(waveform, dim=0, keepdim=True)

            # Resample to 16 kHz if necessary; the ASR model expects 16 kHz input
            if sample_rate != 16000:
                waveform = torchaudio.transforms.Resample(sample_rate, 16000)(waveform)

            # Use Hugging Face's ASR pipeline
            asr_pipeline = pipeline(
                "automatic-speech-recognition",
                model="facebook/wav2vec2-base-960h",
            )

            # Transcribe the audio; the pipeline accepts a 1-D float waveform
            transcript = asr_pipeline(waveform.numpy()[0])
            return transcript

        transcription = transcribe_audio(audio)
        print("Transcription:", transcription)

        ## Initiate summary model
        tokenizer_summary = BartTokenizer.from_pretrained("facebook/bart-large-cnn")
        model_summary = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn")

        def summarize_text(text, model, tokenizer, max_length=100):
            input_ids = tokenizer.encode(text, return_tensors="pt")
            summary_ids = model.generate(
                input_ids, max_length=max_length, num_beams=4, early_stopping=True
            )
            return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        summary = summarize_text(transcription["text"], model_summary, tokenizer_summary)
        st.write("Here is your summary!")
        st.write(summary)

    elif option_language == "Spanish":

        def transcribe_audio(audio_file):
            # Load the audio file
            waveform, sample_rate = torchaudio.load(audio_file)

            # Ensure mono-channel audio by averaging the channels
            if waveform.shape[0] > 1:
                waveform = torch.mean(waveform, dim=0, keepdim=True)

            # Resample to 16 kHz if necessary; the ASR model expects 16 kHz input
            if sample_rate != 16000:
                waveform = torchaudio.transforms.Resample(sample_rate, 16000)(waveform)

            # Use Hugging Face's ASR pipeline with a Spanish Whisper checkpoint
            asr_pipeline = pipeline(
                "automatic-speech-recognition",
                model="Sandiago21/whisper-large-v2-spanish",
            )

            # Transcribe the audio; the pipeline accepts a 1-D float waveform
            transcript = asr_pipeline(waveform.numpy()[0])
            return transcript

        transcription = transcribe_audio(audio)
        print("Aquí tienes tu transcripción:", transcription)

        ## Initiate summary model
        # Note: facebook/mbart-large-50 is a general multilingual seq2seq
        # checkpoint, not one fine-tuned for summarization, so output quality
        # may be rough.
        tokenizer_summary = AutoTokenizer.from_pretrained("facebook/mbart-large-50", src_lang="es_XX")
        model_summary = AutoModelForSeq2SeqLM.from_pretrained("facebook/mbart-large-50")

        def summarize_text(text, model, tokenizer, max_length=100):
            input_ids = tokenizer.encode(text, return_tensors="pt")
            summary_ids = model.generate(
                input_ids, max_length=max_length, num_beams=4, early_stopping=True
            )
            return tokenizer.decode(summary_ids[0], skip_special_tokens=True)

        summary = summarize_text(transcription["text"], model_summary, tokenizer_summary)
        st.write("¡Aquí tienes tu resumen!")
        st.write(summary)
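
# ----------------------------------------------------------------------
# Optional improvement (a minimal sketch, not wired into the app above):
# Streamlit reruns the whole script on every widget interaction, so the
# models above are reloaded from disk each time. st.cache_resource
# (available in Streamlit >= 1.18; an assumption about your version)
# keeps one loaded copy across reruns. load_summary_model is a
# hypothetical helper name introduced here for illustration.
# ----------------------------------------------------------------------
@st.cache_resource
def load_summary_model(checkpoint="facebook/bart-large-cnn"):
    # Loaded once and cached; subsequent reruns reuse the same objects.
    tokenizer = BartTokenizer.from_pretrained(checkpoint)
    model = BartForConditionalGeneration.from_pretrained(checkpoint)
    return tokenizer, model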
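
# ----------------------------------------------------------------------
# Running the app (assuming this file is saved as app.py):
#
#   pip install streamlit torch torchaudio transformers
#   streamlit run app.py
#
# Decoding MP3 uploads via torchaudio may additionally require an audio
# backend such as ffmpeg, depending on the platform.
# ----------------------------------------------------------------------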