import io

import streamlit as st
import torch
import torchaudio
from transformers import pipeline
from streamlit_mic_recorder import mic_recorder

option = st.selectbox("How do you want to import the audio file?", ("Microphone", "Upload file"))

audio = None
if option == "Microphone":
    # Record audio in the browser and let the user play it back
    st.write("Record your voice, and play the recorded audio:")
    recording = mic_recorder(
        start_prompt="Press the button to start recording ⏺️",
        stop_prompt="Press the button to stop the recording ⏹️",
        key="recorder",
    )
    if recording is None:
        st.write("Please start the recording in the box above")
    else:
        st.audio(recording["bytes"])
        # torchaudio.load expects a path or file-like object, not raw bytes
        audio = io.BytesIO(recording["bytes"])
elif option == "Upload file":
    audio = st.file_uploader(label="Upload your audio file here", type=["wav", "mp3"])
    if audio:
        st.audio(audio)

option_language = st.selectbox(
    "Select the language of your audio",
    ("English", "Spanish", "German", "French", "Chinese"),
)

# One ASR checkpoint and one confirmation message per language; the five
# per-language transcription branches were identical except for these two values.
MODELS = {
    "English": ("openai/whisper-large-v2", "Here is your transcription:"),
    "Spanish": ("Sandiago21/whisper-large-v2-spanish", "Aquí tienes tu transcripción:"),
    "German": ("primeline/whisper-large-v3-german", "Hier ist Ihre Transkription:"),
    "French": ("bofenghuang/whisper-large-v2-french", "Voici votre transcription :"),
    "Chinese": ("yi-ching/whisper-tiny-chinese-test", "这是您的转录。"),
}


@st.cache_resource
def load_asr_pipeline(model_name):
    # Cache the pipeline so the model is not reloaded on every Streamlit rerun
    return pipeline("automatic-speech-recognition", model=model_name)


def transcribe_audio(audio_file, model_name):
    # Load the audio file
    waveform, sample_rate = torchaudio.load(audio_file)
    # Ensure mono-channel audio
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)
    # Resample to the 16 kHz rate Whisper expects
    if sample_rate != 16000:
        waveform = torchaudio.transforms.Resample(sample_rate, 16000)(waveform)
    # Pass the raw float waveform with an explicit sampling rate
    asr_pipeline = load_asr_pipeline(model_name)
    result = asr_pipeline({"raw": waveform.squeeze().numpy(), "sampling_rate": 16000})
    return result["text"]
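# A minimal sketch for long recordings, under the assumption that inputs may run
# past the ~30 s window a Whisper checkpoint sees at once: the transformers ASR
# pipeline can split the audio itself via chunk_length_s and stitch the partial
# transcripts back together. transcribe_long_audio is an illustrative helper name,
# and 30 s is an assumed chunk size to tune, not something the app above requires.
def transcribe_long_audio(audio_file, model_name, chunk_length_s=30):
    waveform, sample_rate = torchaudio.load(audio_file)
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)
    if sample_rate != 16000:
        waveform = torchaudio.transforms.Resample(sample_rate, 16000)(waveform)
    # chunk_length_s makes the pipeline transcribe the audio in overlapping windows
    asr_pipeline = pipeline(
        "automatic-speech-recognition", model=model_name, chunk_length_s=chunk_length_s
    )
    result = asr_pipeline({"raw": waveform.squeeze().numpy(), "sampling_rate": 16000})
    return result["text"]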
if audio is None:
    st.write("Please upload the audio in the box above")
else:
    # Look up the checkpoint and confirmation message for the selected language
    model_name, message = MODELS[option_language]
    transcription = transcribe_audio(audio, model_name)
    st.write(message)
    st.write(transcription)
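# A hedged design alternative: the multilingual openai/whisper-large-v2 checkpoint
# can transcribe all five languages on its own when told which language to expect,
# so the per-language checkpoints in MODELS could be collapsed into a single model.
# generate_kwargs with "language" and "task" is supported by the transformers
# Whisper pipeline; lowercasing the app's language names to get Whisper language
# names ("spanish", "chinese", ...) is an assumption of this sketch.
def transcribe_multilingual(audio_file, language):
    waveform, sample_rate = torchaudio.load(audio_file)
    if waveform.shape[0] > 1:
        waveform = torch.mean(waveform, dim=0, keepdim=True)
    if sample_rate != 16000:
        waveform = torchaudio.transforms.Resample(sample_rate, 16000)(waveform)
    asr_pipeline = load_asr_pipeline("openai/whisper-large-v2")
    result = asr_pipeline(
        {"raw": waveform.squeeze().numpy(), "sampling_rate": 16000},
        generate_kwargs={"language": language.lower(), "task": "transcribe"},
    )
    return result["text"]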