# Earlier approach, kept for reference: load the model directly with gr.load.
# import gradio as gr
# import os
# gr.load("models/google/gemma-1.1-7b-it", hf_token=os.environ.get("YOUR_API_TOKEN"), streaming=True).launch()

import os

import gradio as gr
from openai import OpenAI  # requires the "openai" package; list it in requirements.txt rather than pip-installing at runtime

# Talk to the Hugging Face Inference API through its OpenAI-compatible endpoint.
client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1",
    api_key=os.environ.get("YOUR_API_TOKEN"),
)


def predict(message, history):
    # Convert Gradio's (user, assistant) tuple history into the OpenAI
    # chat-completions message format, then append the new user message.
    history_openai_format = []
    for human, assistant in history:
        history_openai_format.append({"role": "user", "content": human})
        history_openai_format.append({"role": "assistant", "content": assistant})
    history_openai_format.append({"role": "user", "content": message})

    response = client.chat.completions.create(
        model="meta-llama/Meta-Llama-3-8B-Instruct",
        # model="nvidia/Llama3-ChatQA-1.5-8B",  # alternative model, kept for reference
        messages=history_openai_format,
        temperature=0.7,
        stream=True,
        max_tokens=3000,
    )

    # Stream the reply: yielding the accumulated text on each chunk lets
    # gr.ChatInterface render the response incrementally.
    partial_message = ""
    for chunk in response:
        if chunk.choices[0].delta.content is not None:
            partial_message = partial_message + chunk.choices[0].delta.content
            yield partial_message


gr.ChatInterface(predict).launch()
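# A minimal local-run sketch (assumptions: "gradio" and "openai" are installed,
# the YOUR_API_TOKEN environment variable holds a valid Hugging Face access
# token, and this file is saved as app.py; both the token value and the file
# name below are placeholders):
#
#   export YOUR_API_TOKEN=hf_...
#   python app.py
#
# By default, Gradio serves the chat UI at http://127.0.0.1:7860.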