# Spaces:
# Build error
# Build error
# Batched text-generation example: builds an ExLlama model, tokenizer, cache
# and generator from a local model directory, then generates a continuation
# for every prompt in `prompts` with a single batched call and prints them.

import glob
import os

from model import ExLlama, ExLlamaCache, ExLlamaConfig
from tokenizer import ExLlamaTokenizer
from generator import ExLlamaGenerator

# Directory containing model, tokenizer, generator

model_directory = "/mnt/str/models/llama-13b-4bit-128g/"

# Locate files we need within that directory

tokenizer_path = os.path.join(model_directory, "tokenizer.model")
model_config_path = os.path.join(model_directory, "config.json")
st_pattern = os.path.join(model_directory, "*.safetensors")

# glob.glob returns a list of every matching path (possibly empty).
# NOTE(review): the whole list is handed to the config below; presumably
# ExLlama accepts a list of weight files -- confirm against the model module.
model_path = glob.glob(st_pattern)
if not model_path:
    # Fail early with a clear message instead of passing an empty list on.
    raise FileNotFoundError(
        f"No .safetensors weights file found matching {st_pattern}"
    )

# Batched prompts

prompts = [
    "Once upon a time,",
    "I don't like to",
    "A turbo encabulator is a",
    "In the words of Mark Twain,",
]

# Create config, model, tokenizer and generator

config = ExLlamaConfig(model_config_path)               # create config from config.json
config.model_path = model_path                          # supply path to model weights file

model = ExLlama(config)                                 # create ExLlama instance and load the weights
tokenizer = ExLlamaTokenizer(tokenizer_path)            # create tokenizer from tokenizer model file

# The cache is sized to the number of prompts so all of them run in one batch.
cache = ExLlamaCache(model, batch_size=len(prompts))    # create cache for inference
generator = ExLlamaGenerator(model, tokenizer, cache)   # create generator

# Configure generator

generator.disallow_tokens([tokenizer.eos_token_id])

generator.settings.token_repetition_penalty_max = 1.2
generator.settings.temperature = 0.95
generator.settings.top_p = 0.65
generator.settings.top_k = 100
generator.settings.typical = 0.5

# Generate, batched

# Echo the input prompts first.
for line in prompts:
    print(line)

output = generator.generate_simple(prompts, max_new_tokens=200)

# Print each generated result, preceded by a separator line.
for line in output:
    print("---")
    print(line)