Vira21 committed
Commit 3c50dcb
1 Parent(s): bd031de

Update app.py

Files changed (1)
  1. app.py +5 -4
app.py CHANGED
@@ -1,5 +1,5 @@
 import torch
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
 import gradio as gr
 
 # Constants for the Model
@@ -10,11 +10,12 @@ MODEL_DESC = "A demo for the SeaLLMs-v3-7B-Chat language model."
 # Load the tokenizer
 tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
 
-# Load the model with efficient settings
+# Load the model with efficient settings, using 8-bit quantization to reduce memory usage
+quantization_config = BitsAndBytesConfig(load_in_8bit=True)
 model = AutoModelForCausalLM.from_pretrained(
     MODEL_PATH,
     device_map="auto",
-    torch_dtype=torch.float32,  # Switching to float32 to avoid numerical instability
+    quantization_config=quantization_config,
     low_cpu_mem_usage=True
 )
 
@@ -31,7 +32,7 @@ def generate_response(prompt):
     try:
         outputs = model.generate(
            **inputs,
-            max_length=512,
+            max_length=256,  # Reduced max_length to lower memory usage
             num_return_sequences=1,
             no_repeat_ngram_size=2,
             early_stopping=True,
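
For reference, here is a minimal sketch of what the model-loading and generation path looks like after this commit. Only the import, the BitsAndBytesConfig quantization, and the generate() settings come from the diff; the MODEL_PATH value and the parts of generate_response outside the shown hunks are assumptions filled in for illustration. Loading with load_in_8bit=True additionally requires the bitsandbytes package and a CUDA-capable GPU.

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Assumed model id; the real MODEL_PATH constant is elided from the diff
MODEL_PATH = "SeaLLMs/SeaLLMs-v3-7B-Chat"

tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

# 8-bit weights use roughly 1 byte per parameter, about a quarter of the
# float32 footprint the previous revision loaded
quantization_config = BitsAndBytesConfig(load_in_8bit=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    device_map="auto",
    quantization_config=quantization_config,
    low_cpu_mem_usage=True,
)

def generate_response(prompt):
    # Tokenization and error handling here are assumptions; the diff only
    # shows the generate() call itself
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    try:
        outputs = model.generate(
            **inputs,
            max_length=256,  # reduced from 512 to lower memory usage
            num_return_sequences=1,
            no_repeat_ngram_size=2,
            early_stopping=True,
        )
        return tokenizer.decode(outputs[0], skip_special_tokens=True)
    except Exception as e:
        return f"Error during generation: {e}"

For a 7B-parameter model, 8-bit weights need on the order of 7 GB instead of the ~28 GB that float32 required, which, together with the smaller max_length, is the memory saving this commit is after.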