# SorcererLM-8x22b-bf16 / train / sorc.toml
#
# Repository page metadata captured along with this file (not configuration):
#   Upload 2 files
#   4fa1155 verified about 1 month ago
#   No virus
#   6.46 kB

	# Filesystem locations
	# Location of the model.
	model = "/workspace/model"
	# Parent directory for run outputs; each run lives in a timestamp-named directory inside it.
	output_dir = "/workspace/out"

	# LoRA settings
	# To train the whole model instead of a LoRA, enable full_fine_tune and use no quantization.
	#full_fine_tune = true
	lora_rank = 16
	lora_alpha = 32
	lora_dropout = 0.05

	# Restrict training to specific modules. The list is forwarded to the LoraConfig parameter
	# of the same name. When unset, all linear modules are adapted.
	# Note that this ALSO applies to full fine tuning: when set, only weights whose name contains
	# one of these keys as a substring get requires_grad; when unset, everything is trained.
	#target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj']

	# Optionally choose which layers LoRA adapts.
	#layers_to_transform = '16:31'

	# Load balancing coefficient, for Mixtral.
	#load_balancing_loss_coef = 0.02

	# Optimization settings
	epochs = 2
	# Learning rate schedule; 'constant' is also accepted.
	lr_scheduler = "cosine"
	warmup_steps = 50

	# Forces the LR to a fixed value. May be useful when resuming from a checkpoint
	# and you want to change the LR.
	#force_constant_lr = 5e-5

	# Hard clamp on the magnitude of the LoRA weights.
	#scale_weight_norms = 1.0

	# Dynamic batch size: target this many tokens per batch, per device.
	# When set, the batch size in the deepspeed JSON config file is completely ignored.
	# Think of it as a replacement for sample packing.
	batch_size_tokens = 10000

	# Performance settings
	# Number of pipeline parallel stages. Must evenly divide the number of GPUs the
	# script is launched with.
	pipeline_stages = 8
	# How often to log in Tensorboard.
	logging_steps = 10
	eval_steps = 500
	save_steps = 500
	checkpoint_every_n_minutes = 60
	# Whether to run an eval before any training happens.
	eval_before_first_step = false
	# dtype used when loading the underlying model weights.
	model_weight_dtype = "bfloat16"
	# dtype of the LoRA weights.
	lora_weight_dtype = "bfloat16"
	# Optional: save the weights in a different dtype. Could be useful for training in
	# float32 but saving with float16.
	#save_dtype = 'bfloat16'
	# How many stepXXXX (model saves) and global_stepXXX (checkpoint saves) to keep; the rest
	# are deleted. Only the current training session is affected: resumed training sessions
	# will not touch old saves.
	keep_states = 5

	# Sort examples by length before dividing them into batches, so that all examples in a
	# batch are approximately the same length and padding is minimized. The batches are
	# still shuffled afterwards. You should probably always leave this set to true.
	group_by_length = true

	# Activation checkpointing. Besides true, this can be 'unsloth' to offload hidden states
	# to CPU, saving potentially a lot of VRAM for a minor performance hit.
	# Example: 4x4090, PCIE 3.0 16x, pipeline_stages=4, training QLoRA on Llama 3 70B with
	# 4096 sequence length.
	#   true:      75s step time, 19.7G peak per-GPU VRAM usage.
	#   'unsloth': 78s step time, 16.2G peak per-GPU VRAM usage.
	activation_checkpointing = "unsloth"

	# Hold the MLP weights in system RAM until they are needed. This can save a ton of VRAM
	# at a moderate cost in performance. For an MoE model the value may also be an integer,
	# in which case only that many experts are offloaded (a tradeoff between VRAM and speed).
	offload_mlp_to_cpu = 2

	# Resuming a prior run
	# When true, training attempts to resume from the most recent directory inside output_dir
	# (the directory names are timestamps). To resume, run the exact same command after
	# setting this to true.
	resume_from_checkpoint = false

	# Loading the optimizer states seems to cause some kind of unavoidable VRAM memory leak.
	# It's very small, only about 0.2 GB in cases I've seen. But if you are very close to the
	# limit, it can cause resuming from checkpoint to OOM. As a last resort, you can uncomment
	# this to not load the optimizer states and hopefully the resumption won't OOM.
	#load_optimizer_states = false


	# Dataset configuration

	# How multiple datasets are combined, if there is more than one:
	# 'concatenate' or 'interleave'. Defaults to 'concatenate' when unset.
	dataset_combination_mode = "interleave"
	# With mode 'interleave', when to stop interleaving datasets:
	# 'first_exhausted' or 'all_exhausted'. Defaults to 'first_exhausted' when unset.
	dataset_interleave_stopping_strategy = "all_exhausted"
	# May be set lower than the training value, so fewer examples are dropped when trying
	# to make equal-sized batches. Defaults to the training GAS when unset.
	eval_gradient_accumulation_steps = 1

	# bitsandbytes 4 bit quantization. The parameters here become arguments to Transformers BitsAndBytesConfig.
	#[quantization.bnb]
	#load_in_4bit = true
	#bnb_4bit_use_double_quant = false
	#bnb_4bit_compute_dtype = 'bfloat16'

	# HQQ quantization. The parameters here become arguments to CustomHQQConfig.
	# [quantization.hqq]
	# nbits = 4
	# group_size = 64
	# compute_dtype = 'bfloat16'

	# (Optional) You can override the quant params for certain modules. This does substring matching, e.g. if 'gate_proj'
	# is a substring of the full module name, anything specified overwrites the defaults in [quantization.hqq].
	# [quantization.hqq.dynamic_config]
	# gate_proj = {nbits = 2, group_size = 16, quant_zero = true, quant_scale = true}
	# up_proj = {nbits = 2, group_size = 16, quant_zero = true, quant_scale = true}
	# down_proj = {nbits = 2, group_size = 16, quant_zero = true, quant_scale = true}

	[optimizer]
	# Available types: adamw_kahan, AdamW, AdamW8bit
	type = "adamw_kahan"
	lr = 5e-5
	beta1 = 0.9
	beta2 = 0.99
	weight_decay = 0.1

	[[datasets]]
	# Arbitrary name, used only to log eval metrics separately.
	# Defaults to dataset0, dataset1, etc. when unset.
	name = "c2"
	dataset_type = "axolotl"
	dataset_path = "../axolotl/sorc.yml"
	sequence_len = 8192
	# Splits off some of the dataset for eval.
	eval_size = 0.01
	# Relative sampling weight under combination mode 'interleave'. Defaults to 1 when unset.
	sample_weight = 1

	#[[datasets]]
	#name = 'capybara'
	#dataset_type = 'axolotl'
	#dataset_path = 'examples/capybara.yml'
	#sequence_len = 2048
	#eval_size = 0.02
	#sample_weight = 1.5

	# In addition to using eval_size which splits off some of the dataset, we can have completely separate datasets for eval.
	# This can be useful if you're training on raw text data, so that the eval set remains completely fixed, even if
	# you change training sequence_len, etc.
	# This is just an example; typically you would not have an eval dataset overlap a training dataset.
	# [[eval_datasets]]
	# name = 'capybara'
	# dataset_type = 'axolotl'
	# dataset_path = 'examples/capybara.yml'
	# sequence_len = 2048