import gradio as gr
import spaces
# import torch
from huggingface_hub import hf_hub_download
from llama_cpp import Llama, LlamaGrammar

# ZeroGPU quirk: no GPU is attached at import time, so a torch probe here
# reports 'cpu'; the same probe inside a @spaces.GPU function sees 'cuda:0'.
# zero = torch.Tensor([0]).cuda()
# print(f'zero.device: {zero.device}')  # <-- 'cpu' 🤔

llm = None  # loaded lazily on the first GPU call so the GGUF file is fetched only once


@spaces.GPU
def greet(n):
    global llm
    if llm is None:
        llm = load_model(download_model())
    # print(f'zero.device: {zero.device}')  # <-- 'cuda:0' 🤗

    # GBNF grammar: only `sentence` is reachable from `root`; the `answer`,
    # `weather`, `complaint`, `yesno`, and `gen` rules are defined but unused
    # (swap in `root ::= answer` to activate them).
    grammar = LlamaGrammar.from_string('''
    root ::= sentence
    answer ::= (weather | complaint | yesno | gen)
    weather ::= ("Sunny." | "Cloudy." | "Rainy.")
    complaint ::= "I don't like talking about the weather."
    yesno ::= ("Yes." | "No.")
    gen ::= "1. " [A-Z] [a-z] [a-z]*
    sentence ::= [A-Z] [A-Za-z0-9 ,-]* ("." | "!" | "?")
    ''')

    prompts = [
        "How's the weather in London?",
        "How's the weather in Munich?",
        "How's the weather in Barcelona?",
    ]
    # Use the numeric input to pick a prompt rather than ignoring it.
    prompt = prompts[int(n) % len(prompts)]

    print(f'Running grammar-constrained inference... {prompt}')
    output = llm(
        prompt,
        max_tokens=512,
        temperature=0.4,
        grammar=grammar,
    )
    print(f'Returned: {output}')

    s = output['choices'][0]['text']
    print(f'{s} , len(s) = {len(s)}')
    return f"Hello {s} Tensor"


def download_model():
    """Fetch the quantized GGUF weights from the Hub and return the local path."""
    REPO_ID = "TheBloke/Llama-2-7B-GGUF"
    FILENAME = "llama-2-7b.Q5_K_S.gguf"
    print(f'Downloading model {REPO_ID}/{FILENAME}')
    path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)
    print(f'Local path: {path}')
    return path


def load_model(model_file):
    """Load the GGUF model with all layers offloaded to the GPU."""
    print(f'Loading model: {model_file}')
    return Llama(
        model_path=model_file,
        n_gpu_layers=-1,  # -1 offloads every layer to the GPU
        verbose=True,
    )


demo = gr.Interface(fn=greet, inputs=gr.Number(), outputs=gr.Text())
demo.launch(share=False)
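
# ---------------------------------------------------------------------------
# A minimal sketch of running this locally, assuming the file is saved as
# app.py (the conventional entry point for a Hugging Face Space) and that
# llama-cpp-python was built with GPU support; neither detail is stated in
# the original:
#
#   pip install gradio spaces huggingface_hub llama-cpp-python
#   python app.py
#
# Outside a ZeroGPU Space the @spaces.GPU decorator is a no-op, so the same
# script should also run on any machine with a GPU that llama.cpp can use.
# ---------------------------------------------------------------------------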