import gradio as gr

# Alternative: serve the hosted model directly via the Inference API
# gr.load("models/kirankunapuli/Gemma-2B-Hinglish-LORA-v1.0").launch()

import re

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the fine-tuned Gemma-2B Hinglish model and its tokenizer
tokenizer = AutoTokenizer.from_pretrained("kirankunapuli/Gemma-2B-Hinglish-LORA-v1.0")
model = AutoModelForCausalLM.from_pretrained("kirankunapuli/Gemma-2B-Hinglish-LORA-v1.0")

# Use the GPU if one is available, otherwise fall back to CPU
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = model.to(device)

# Alpaca-style prompt template used during fine-tuning
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""


def get_response(input_text: str) -> str:
    inputs = tokenizer(
        [
            alpaca_prompt.format(
                "Please answer the following sentence as requested",  # instruction
                input_text,  # input
                "",  # output - leave this blank for generation!
            )
        ],
        return_tensors="pt",
    ).to(device)

    outputs = model.generate(**inputs, max_new_tokens=256)
    output = tokenizer.batch_decode(outputs)[0]

    # Capture everything after the Response header, up to the EOS token or the
    # end of the string. A bare lazy group (.*?) with nothing following it
    # would always match an empty string.
    response_pattern = re.compile(r"### Response:\n(.*?)(?:<eos>|$)", re.DOTALL)
    response_match = response_pattern.search(output)

    if response_match:
        response = response_match.group(1).strip()
        return response
    else:
        return "Response not found"


interface = gr.Interface(
    fn=get_response,
    inputs="text",
    outputs="text",
    title="Gemma Hinglish Model Inference",
)
interface.launch()