"""
this model only supports english since text to speech is an english only model
"""
from google.cloud import texttospeech
import os
import openai
import gradio as gr
from dotenv import load_dotenv
import pinecone

"""
login to gcp
"""
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "gcp_access_key.json"
# Instantiates a client
client = texttospeech.TextToSpeechClient()
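# Note: gcp_access_key.json is assumed to be a GCP service-account key file in
# the working directory; the client picks it up through the
# GOOGLE_APPLICATION_CREDENTIALS environment variable set above.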

""" 
Connecting to Open AI API
"""
load_dotenv()
openai.organization = os.getenv("OPENAI_ORG")
openai.api_key = os.getenv("OPENAI_API_KEY")
EMBEDDING_MODEL = "text-embedding-ada-002"
"""
Connecting to pincone API and assign index
"""
index_name = 'economic-forecast'
pinecone.init(
    api_key=os.getenv("Pinecone_KEY"),
    environment=os.getenv("Pinecone_ENV")
)
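# Assumption: the 'economic-forecast' index already exists in Pinecone and was
# built with text-embedding-ada-002 vectors, each storing its source passage
# under metadata['text']; that is what the retrieval step below relies on.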

## The initial system message defining GPT's role is created inside transcribe()


"""
define the text -> speech function
"""
def text2speech(text):

    # Set the text input to be synthesized
    synthesis_input = texttospeech.SynthesisInput(text=text)

    # Build the voice request: select the language code ("en-US"), a specific
    # news voice, and the SSML voice gender ("female")
    voice = texttospeech.VoiceSelectionParams(
        language_code="en-US", name="en-US-News-K", ssml_gender=texttospeech.SsmlVoiceGender.FEMALE
    )

    # Select the type of audio file you want returned
    audio_config = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3
    )

    # Perform the text-to-speech request on the text input with the selected
    # voice parameters and audio file type
    response = client.synthesize_speech(
        input=synthesis_input, voice=voice, audio_config=audio_config
    )
    # The response's audio_content is binary.
    with open("output.mp3", "wb") as out:
        # Write the response to the output file.
        out.write(response.audio_content)
        print('Audio content written to file "output.mp3"')
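
# Example usage (sketch): synthesize a single sentence to output.mp3.
# text2speech("Welcome, stranger. What would you like to know?")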

"""
define voice -> gpt -> text -> voice workflow
"""
def transcribe(audio):

    """
    gradio output file doesn't have .wav so rename the file to the correct format
    """
    extension = ".wav"
    audiofomated = f"{audio}{extension}"
    os.rename(audio,audiofomated) 

    """
    pass the audio file to whisper to transcribe

    """
    audio_file = open(audiofomated, "rb")
    transcript = openai.Audio.transcribe("whisper-1", audio_file)

    
    """
    run cosin similarity to find context
    """
    ### Input the question and search for the relavent text
    index = pinecone.Index(index_name)
    query = openai.Embedding.create(input=transcript["text"], model=EMBEDDING_MODEL)["data"][0]["embedding"] # embed the user query into an embedding vector
    res = index.query(query, top_k=3, include_metadata=True) # run cosin similarity to search the most relevant embeded content; this is done in pinecone only
    contexts = [
            x['metadata']['text'] for x in res['matches']
        ]
    merged_context = "".join(contexts)
    contextwithQuestion = "Context: " + "\n"+ merged_context + "*End of the context*" + "\n\n" +  "Question: " + transcript["text"]


    """
    pass the transcripted text to GPT
    """
    messages = [
    {"role": "system", 
     "content": 
        "You are Elvire. Forest oracle dedicated to share her knowledge with accidental strangers.\
        "}
] 
    messages.append({"role": "user", "content":contextwithQuestion}) ## add user input to the list of message
 
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages
    )  ## pass the list of messages to GPT

    messages.append({"role": "assistant", "content": response["choices"][0]["message"]["content"]})  ## add GPT's response to the list of messages
    text2speech(response["choices"][0]["message"]["content"])  ## create the mp3 voice output
    
    voice_path = os.path.abspath("output.mp3")

    return voice_path, "\n".join([f"{msg['role']}: {msg['content']}" for msg in messages])

audio_input = gr.inputs.Audio(source="microphone", type="filepath", label="Speak here...")
chat_output = gr.outputs.Textbox(label="Chat Messages")
audio_output = gr.outputs.Audio(type="filepath", label="Synthesized Voice")  # transcribe() returns a file path, so the output type is "filepath"

gr.Interface(fn=transcribe,
             inputs=audio_input,
             outputs=[audio_output, chat_output],
             live=True,
             allow_flagging=False).launch()