# Earlier approach, kept for reference: load the model directly with gr.load.
# import gradio as gr
# import os
# gr.load("models/google/gemma-1.1-7b-it", hf_token=os.environ.get("YOUR_API_TOKEN"), streaming=True).launch()

import os

import gradio as gr
from openai import OpenAI  # requires the "openai" package; list it in requirements.txt rather than pip-installing at runtime

# Talk to the Hugging Face Inference API through its OpenAI-compatible endpoint.
client = OpenAI(
    base_url="https://api-inference.huggingface.co/v1",
    api_key=os.environ.get("YOUR_API_TOKEN"),
)


def predict(message, history):
    # Convert Gradio's (user, assistant) tuple history into the OpenAI
    # chat-completions message format, then append the new user message.
    history_openai_format = []
    for human, assistant in history:
        history_openai_format.append({"role": "user", "content": human})
        history_openai_format.append({"role": "assistant", "content": assistant})
    history_openai_format.append({"role": "user", "content": message})

    response = client.chat.completions.create(
        model="meta-llama/Meta-Llama-3-8B-Instruct",
        # model="nvidia/Llama3-ChatQA-1.5-8B",  # alternative model, kept for reference
        messages=history_openai_format,
        temperature=0.7,
        stream=True,
        max_tokens=3000,
    )

    # Stream the reply: yielding the accumulated text on each chunk lets
    # gr.ChatInterface render the response incrementally.
    partial_message = ""
    for chunk in response:
        if chunk.choices[0].delta.content is not None:
            partial_message = partial_message + chunk.choices[0].delta.content
            yield partial_message


gr.ChatInterface(predict).launch()
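# A minimal local-run sketch (assumptions: "gradio" and "openai" are installed,
# the YOUR_API_TOKEN environment variable holds a valid Hugging Face access
# token, and this file is saved as app.py; both the token value and the file
# name below are placeholders):
#
#   export YOUR_API_TOKEN=hf_...
#   python app.py
#
# By default, Gradio serves the chat UI at http://127.0.0.1:7860.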