import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "TinyPixel/Llama-2-7B-bf16-sharded"

# Load the base model in 4-bit (NF4) with float16 compute so it fits on a single GPU
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True
)
model.config.use_cache = False

"""Let's also load the tokenizer below."""

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

from peft import LoraConfig, get_peft_model

lora_alpha = 16
lora_dropout = 0.1
lora_r = 64

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM"
)

"""## Loading the trainer

Here we will use the [`SFTTrainer` from the TRL library](https://huggingface.co/docs/trl/main/en/sft_trainer), which provides a wrapper around the transformers `Trainer` to easily fine-tune models on instruction-based datasets using PEFT adapters. Let's first load the training arguments below.
"""

from transformers import TrainingArguments

output_dir = "./results"
per_device_train_batch_size = 4
gradient_accumulation_steps = 4
optim = "paged_adamw_32bit"
save_steps = 100
logging_steps = 10
learning_rate = 2e-4
max_grad_norm = 0.3
max_steps = 100
warmup_ratio = 0.03
lr_scheduler_type = "constant"

training_arguments = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    fp16=True,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=True,
    lr_scheduler_type=lr_scheduler_type,
)

"""Then finally pass everything to the trainer. Note that `dataset` is assumed to have been loaded in an earlier section and to contain a `"text"` column with the training prompts."""

from trl import SFTTrainer

max_seq_length = 512

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    tokenizer=tokenizer,
    args=training_arguments,
)

"""We will also pre-process the model by upcasting the layer norms to float32 for more stable training."""

for name, module in trainer.model.named_modules():
    if "norm" in name:
        module = module.to(torch.float32)

"""## Train the model

Now let's train the model! Simply call `trainer.train()`.
"""

trainer.train()

"""During training, the model should converge nicely as follows:

![image](https://huggingface.co/datasets/trl-internal-testing/example-images/resolve/main/images/loss-falcon-7b.png)

The `SFTTrainer` also takes care of properly saving only the adapters during training instead of saving the entire model.
"""

# Take care of distributed/parallel training
model_to_save = trainer.model.module if hasattr(trainer.model, 'module') else trainer.model
model_to_save.save_pretrained("outputs")

# Reload the saved adapter configuration and attach it to the model
lora_config = LoraConfig.from_pretrained('outputs')
model = get_peft_model(model, lora_config)

# Inspect the training texts
dataset['text']

# Example French prompt (the model was fine-tuned on French data):
# "Write a text in a baroque style about ice and fire", followed by a partial assistant turn
text = "Écrire un texte dans un style baroque sur la glace et le feu ### Assistant: Si j'en luis éton"
device = "cuda:0"

inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

# Log in to the Hugging Face Hub and push only the adapter weights
from huggingface_hub import login
login()

model.push_to_hub("llama2-qlora-finetunined-french")
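"""To reuse the fine-tuned adapter later, you can reload the 4-bit base model and attach the pushed adapter with `PeftModel.from_pretrained`. The snippet below is a minimal sketch, not part of the original notebook: it reuses `model_name`, `bnb_config`, `tokenizer`, `text`, and `device` from above, and the `"your-username/llama2-qlora-finetunined-french"` repository id is a placeholder you should replace with the repository the adapter was actually pushed to."""

from peft import PeftModel

# Reload the base model in 4-bit, exactly as during training
base_model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    trust_remote_code=True,
)

# Attach the LoRA adapter from the Hub (placeholder repo id, replace with your own)
inference_model = PeftModel.from_pretrained(base_model, "your-username/llama2-qlora-finetunined-french")
inference_model.eval()

# Generate with the reloaded adapter to check it matches the in-session results
inputs = tokenizer(text, return_tensors="pt").to(device)
outputs = inference_model.generate(**inputs, max_new_tokens=50)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))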