"""Small FastAPI service that returns completions from a local ctransformers model."""

import threading

from ctransformers import AutoModelForCausalLM
from fastapi import FastAPI
from fastapi.concurrency import run_in_threadpool
from pydantic import BaseModel

# Model loading from local file. This runs once, at import time.
llm = AutoModelForCausalLM.from_pretrained("./model", model_type='mistral', threads=2)

# Only one generation runs at a time. The previous implementation called the
# model directly inside the coroutine, which serialized requests as a side
# effect of holding the event loop; this lock keeps that one-at-a-time
# behaviour now that the call runs in a worker thread.
# NOTE(review): whether the model object tolerates concurrent calls is not
# established here — confirm before removing the lock.
_llm_lock = threading.Lock()


# Pydantic object describing the JSON body accepted by the endpoint.
class Validation(BaseModel):
    """Request payload: the prompt plus optional sampling settings."""

    # Prompt text passed to the model as its first positional argument.
    inputs: str
    # The remaining fields are forwarded as same-named keyword arguments.
    temperature: float = 0.0
    max_new_tokens: int = 1048
    top_p: float = 0.15
    repetition_penalty: float = 1.0


# FastAPI application instance.
app = FastAPI()


def _generate(item):
    """Run one synchronous model call for ``item`` while holding the lock."""
    with _llm_lock:
        return llm(
            item.inputs,
            temperature=item.temperature,
            max_new_tokens=item.max_new_tokens,
            top_p=item.top_p,
            repetition_penalty=item.repetition_penalty,
        )


# Generate LLM completion
@app.post("/")
async def stream(item: Validation):
    """Return the model's completion for the posted prompt.

    The model call is synchronous, so it is dispatched to a worker thread;
    calling it directly in this coroutine would keep the event loop busy
    until generation finished and stall every other request.
    """
    return await run_in_threadpool(_generate, item)