Update config.json to have the same settings as meta-llama/Meta-Llama-3.1-8B-Instruct
#2 · opened by Joseph717171
@numen-tech, I noticed your config.json is missing some settings, most notably (but not limited to) the additional EOS token definitions. Please update it; I think Private LLM's OmniQuants of Llama-3.1-8B-Instruct will work better with these changes. They should probably be applied to your ablated versions of Llama-3.1-8B-Instruct as well.
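For context, those three extra IDs are Llama 3.1's stop tokens. A quick way to confirm the mapping locally (a minimal sketch, assuming the transformers library is installed and you have access to the gated meta-llama repo):

from transformers import AutoTokenizer

# Requires prior authentication (e.g. `huggingface-cli login`) with access to the gated repo.
tok = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")
for tid in (128001, 128008, 128009):
    print(tid, tok.convert_ids_to_tokens(tid))
# Expected:
# 128001 <|end_of_text|>
# 128008 <|eom_id|>
# 128009 <|eot_id|>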
Where,
- a = meta-llama/Meta-Llama-3.1-8B-Instruct/config.json
- b = numen-tech/Meta-Llama-3.1-8B-Instruct-w4a16g128asym/private-llm-config.json
jsarnecki@Josephs-MacBook-Pro ~ % diff $a $b
2,18d1
< "architectures": [
< "LlamaForCausalLM"
< ],
< "attention_bias": false,
< "attention_dropout": 0.0,
< "bos_token_id": 128000,
< "eos_token_id": [
< 128001,
< 128008,
< 128009
< ],
< "hidden_act": "silu",
< "hidden_size": 4096,
< "initializer_range": 0.02,
< "intermediate_size": 14336,
< "max_position_embeddings": 131072,
< "mlp_bias": false,
20,30c3,17
< "num_attention_heads": 32,
< "num_hidden_layers": 32,
< "num_key_value_heads": 8,
< "pretraining_tp": 1,
< "rms_norm_eps": 1e-05,
< "rope_scaling": {
< "factor": 8.0,
< "low_freq_factor": 1.0,
< "high_freq_factor": 4.0,
< "original_max_position_embeddings": 8192,
< "rope_type": "llama3"
---
> "quantization": "w4a16g128asym",
> "model_config": {
> "hidden_size": 4096,
> "intermediate_size": 14336,
> "num_attention_heads": 32,
> "num_hidden_layers": 32,
> "rms_norm_eps": 1e-05,
> "vocab_size": 128256,
> "position_embedding_base": 500000.0,
> "context_window_size": 4096,
> "prefill_chunk_size": 128,
> "num_key_value_heads": 8,
> "head_dim": 128,
> "tensor_parallel_shards": 1,
> "max_batch_size": 80
32,37c19,41
< "rope_theta": 500000.0,
< "tie_word_embeddings": false,
< "torch_dtype": "bfloat16",
< "transformers_version": "4.42.3",
< "use_cache": true,
< "vocab_size": 128256
---
> "vocab_size": 128256,
> "context_window_size": 4096,
> "sliding_window_size": -1,
> "prefill_chunk_size": 128,
> "attention_sink_size": -1,
> "tensor_parallel_shards": 1,
> "mean_gen_len": 128,
> "max_gen_len": 512,
> "shift_fill_factor": 0.3,
> "temperature": 0.6,
> "presence_penalty": 0.0,
> "frequency_penalty": 0.0,
> "repetition_penalty": 1.0,
> "top_p": 0.9,
> "conv_template": "llama-3.1",
> "pad_token_id": 0,
> "bos_token_id": 128000,
> "eos_token_id": 128001,
> "tokenizer_files": [
> "tokenizer.json",
> "tokenizer_config.json"
> ],
> "version": "0.1.0"
The following updated private-llm-config.json, which seamlessly combines meta-llama/Meta-Llama-3.1-8B-Instruct/config.json and numen-tech/Meta-Llama-3.1-8B-Instruct-w4a16g128asym/private-llm-config.json, is a concerted effort of Llama-3.1-70B-Instruct (Groq.com), Grok-2 (Beta), and me. Diff included below. I've made a pull request.
{
  "model_type": "llama",
  "quantization": "w4a16g128asym",
  "model_config": {
    "hidden_size": 4096,
    "intermediate_size": 14336,
    "num_attention_heads": 32,
    "num_hidden_layers": 32,
    "rms_norm_eps": 1e-05,
    "vocab_size": 128256,
    "position_embedding_base": 500000.0,
    "context_window_size": 4096,
    "prefill_chunk_size": 128,
    "num_key_value_heads": 8,
    "head_dim": 128,
    "tensor_parallel_shards": 1,
    "max_batch_size": 80
  },
  "vocab_size": 128256,
  "context_window_size": 4096,
  "sliding_window_size": -1,
  "prefill_chunk_size": 128,
  "attention_sink_size": -1,
  "tensor_parallel_shards": 1,
  "mean_gen_len": 128,
  "max_gen_len": 512,
  "shift_fill_factor": 0.3,
  "temperature": 0.6,
  "presence_penalty": 0.0,
  "frequency_penalty": 0.0,
  "repetition_penalty": 1.0,
  "top_p": 0.9,
  "conv_template": "llama-3.1",
  "pad_token_id": 0,
  "bos_token_id": 128000,
  "eos_token_id": [
    128001,
    128008,
    128009
  ],
  "tokenizer_files": [
    "tokenizer.json",
    "tokenizer_config.json"
  ],
  "version": "0.1.0",
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "hidden_act": "silu",
  "initializer_range": 0.02,
  "max_position_embeddings": 131072,
  "mlp_bias": false,
  "pretraining_tp": 1,
  "rope_scaling": {
    "factor": 8.0,
    "low_freq_factor": 1.0,
    "high_freq_factor": 4.0,
    "original_max_position_embeddings": 8192,
    "rope_type": "llama3"
  },
  "rope_theta": 500000.0,
  "tie_word_embeddings": false,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.42.3",
  "use_cache": true
}
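For reference, a minimal Python sketch of the merge logic behind the config above (filenames are illustrative): it keeps every key already in private-llm-config.json, layers in the Hugging Face keys it lacks, and explicitly takes the full EOS list from Meta's config.

import json

# Illustrative local copies of the two files being merged.
with open("config.json") as f:               # meta-llama/Meta-Llama-3.1-8B-Instruct
    hf = json.load(f)
with open("private-llm-config.json") as f:   # numen-tech quantized repo
    pl = json.load(f)

merged = dict(pl)                  # start from the Private LLM config
for key, value in hf.items():
    merged.setdefault(key, value)  # add only the keys Private LLM lacks
merged["eos_token_id"] = hf["eos_token_id"]  # replace the single ID with all three

with open("private-llm-config-updated.json", "w") as f:
    json.dump(merged, f, indent=2)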
Where,
- a = The combination of meta-llama/Meta-Llama-3.1-8B-Instruct/config.json and numen-tech/Meta-Llama-3.1-8B-Instruct-w4a16g128asym/private-llm-config.json
- b = numen-tech/Meta-Llama-3.1-8B-Instruct-w4a16g128asym/private-llm-config.json
diff $a $b
36,40c36
< "eos_token_id": [
< 128001,
< 128008,
< 128009
< ],
---
> "eos_token_id": 128001,
45,67c41
< "version": "0.1.0",
< "architectures": [
< "LlamaForCausalLM"
< ],
< "attention_bias": false,
< "attention_dropout": 0.0,
< "hidden_act": "silu",
< "initializer_range": 0.02,
< "max_position_embeddings": 131072,
< "mlp_bias": false,
< "pretraining_tp": 1,
< "rope_scaling": {
< "factor": 8.0,
< "low_freq_factor": 1.0,
< "high_freq_factor": 4.0,
< "original_max_position_embeddings": 8192,
< "rope_type": "llama3"
< },
< "rope_theta": 500000.0,
< "tie_word_embeddings": false,
< "torch_dtype": "bfloat16",
< "transformers_version": "4.42.3",
< "use_cache": true
---
> "version": "0.1.0"