from model import ExLlama, ExLlamaCache, ExLlamaConfig
from tokenizer import ExLlamaTokenizer
from generator import ExLlamaGenerator
from lora import ExLlamaLora
import os, glob
import torch

# Directory containing model, tokenizer, generator

model_directory = "/mnt/str/models/_test_models/Neko-Institute-of-Science_LLaMA-7B-4bit-128g/"

# Directory containing LoRA config and weights

lora_directory = "/mnt/str/models/_test_loras/tloen_alpaca-lora-7b/"

# Locate files we need within those directories

tokenizer_path = os.path.join(model_directory, "tokenizer.model")
model_config_path = os.path.join(model_directory, "config.json")
st_pattern = os.path.join(model_directory, "*.safetensors")
model_path = glob.glob(st_pattern)[0]                   # glob returns a list; take the single .safetensors file

lora_config_path = os.path.join(lora_directory, "adapter_config.json")
lora_path = os.path.join(lora_directory, "adapter_model.bin")

# Create config, model, tokenizer and generator

config = ExLlamaConfig(model_config_path)               # create config from config.json
config.model_path = model_path                          # supply path to model weights file

model = ExLlama(config)                                 # create ExLlama instance and load the weights
tokenizer = ExLlamaTokenizer(tokenizer_path)            # create tokenizer from tokenizer model file

cache = ExLlamaCache(model)                             # create cache for inference
generator = ExLlamaGenerator(model, tokenizer, cache)   # create generator

# Load LoRA

lora = ExLlamaLora(model, lora_config_path, lora_path)

# Configure generator

generator.settings.token_repetition_penalty_max = 1.2
generator.settings.temperature = 0.65
generator.settings.top_p = 0.4
generator.settings.top_k = 0
generator.settings.typical = 0.0

# Alpaca prompt

prompt = \
"Below is an instruction that describes a task. Write a response that appropriately completes the request.\n" \
"\n" \
"### Instruction:\n" \
"List five colors in alphabetical order.\n" \
"\n" \
"### Response:"

# Generate with LoRA

print(" --- LoRA ----------------- ")
print("")

generator.lora = lora
torch.manual_seed(1337)
output = generator.generate_simple(prompt, max_new_tokens = 200)
print(output)

# Generate without LoRA

print("")
print(" --- No LoRA -------------- ")
print("")

generator.lora = None
torch.manual_seed(1337)
output = generator.generate_simple(prompt, max_new_tokens = 200)
print(output)