diff --git a/config.json b/config.json
new file mode 100644
index 0000000000000000000000000000000000000000..55ba10e6b55f6ab7c440d1d15758bac9559708f5
--- /dev/null
+++ b/config.json
@@ -0,0 +1,40 @@
+{
+  "_name_or_path": "cogvlm-base-490",
+  "architectures": [
+    "CogVLMForCausalLM"
+  ],
+  "auto_map": {
+    "AutoConfig": "configuration_cogvlm.CogVLMConfig",
+    "AutoModelForCausalLM": "modeling_cogvlm.CogVLMForCausalLM"
+  },
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 11008,
+  "max_position_embeddings": 2048,
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "pad_token_id": 0,
+  "rms_norm_eps": 1e-05,
+  "template_version": "base",
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.35.0",
+  "use_cache": true,
+  "vision_config": {
+    "dropout_prob": 0.0,
+    "hidden_act": "gelu",
+    "hidden_size": 1792,
+    "image_size": 490,
+    "in_channels": 3,
+    "intermediate_size": 15360,
+    "layer_norm_eps": 1e-06,
+    "num_heads": 16,
+    "num_hidden_layers": 63,
+    "num_positions": 1226,
+    "patch_size": 14
+  },
+  "vocab_size": 32000
+}
diff --git a/configuration_cogvlm.py b/configuration_cogvlm.py
new file mode 100644
index 0000000000000000000000000000000000000000..60d487ac530bb1aa6e13828e2a17ae2f521b02d1
--- /dev/null
+++ b/configuration_cogvlm.py
@@ -0,0 +1,45 @@
+from typing import Literal
+from transformers import PretrainedConfig
+
+
+class CogVLMConfig(PretrainedConfig):
+    _auto_class = "AutoConfig"
+
+    def __init__(
+        self,
+        vocab_size=32000,
+        hidden_size=4096,
+        intermediate_size=11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        hidden_act='silu',
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        rms_norm_eps=1e-06,
+        template_version: Literal["base", "chat"] = "chat",
+
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        tie_word_embeddings=False,
+        use_cache=True,
+        **kwargs,
+    ):
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_attention_heads = num_attention_heads
+        self.max_position_embeddings = max_position_embeddings
+        self.rms_norm_eps = rms_norm_eps
+        self.initializer_range = initializer_range
+        self.vocab_size = vocab_size
+        self.num_hidden_layers = num_hidden_layers
+        self.hidden_act = hidden_act
+        self.template_version = template_version
+        self.use_cache = use_cache
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
diff --git a/generation_config.json b/generation_config.json
new file mode 100644
index 0000000000000000000000000000000000000000..a4fee64c3bea2cc2488999c0611a0602930b7e93
--- /dev/null
+++ b/generation_config.json
@@ -0,0 +1,7 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "pad_token_id": 0,
+  "transformers_version": "4.35.0"
+}
diff --git a/model-00001-of-00008.safetensors b/model-00001-of-00008.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..19cccb1ab3123e57b0923e422eaabc90fbd4c9e9
--- /dev/null
+++ b/model-00001-of-00008.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9bae7ad427de1d384610cb51ebb49a62612452eb25c4c3b11dfd74420dfaa526
+size 4938885184
diff --git a/model-00002-of-00008.safetensors b/model-00002-of-00008.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..e495da925b0c23657e73a778a82a3a293bc22e09
--- /dev/null
+++ b/model-00002-of-00008.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dd620e934aabc590bb2dc0bce8681739a9f08c2fa939885aeb80258575697fb5
+size 4947290688
diff --git a/model-00003-of-00008.safetensors b/model-00003-of-00008.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..aeda8c8043806a508bb20055d41cc39d295069e0
--- /dev/null
+++ b/model-00003-of-00008.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d29f7b16fbe52bfb99969a9ff6e35e0c87b70fc2d2fc880cc697913a2c0dbbbd
+size 4947307592
diff --git a/model-00004-of-00008.safetensors b/model-00004-of-00008.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..ac9fa202ea660d086759d895eca8e1439a156cc8
--- /dev/null
+++ b/model-00004-of-00008.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:fe8d4b77588d179ea01b0b7c6a738eb726742bed8c357bc9f4d3ccf69ac67bb3
+size 4991331080
diff --git a/model-00005-of-00008.safetensors b/model-00005-of-00008.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..48e3747a364b4977c47668a148de47e79a69e9cd
--- /dev/null
+++ b/model-00005-of-00008.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6ed3cb6b3446edc32473590757c21c825226c4d24d2cd406f577b904e089946a
+size 4991331088
diff --git a/model-00006-of-00008.safetensors b/model-00006-of-00008.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..4e797b318e9c9e520f3a6af4c9d61f3d3cafabae
--- /dev/null
+++ b/model-00006-of-00008.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:03991a93227d4784edf14892ff9216c1cc935cf0af0d4b7b19b14b1e3f46db52
+size 4970162920
diff --git a/model-00007-of-00008.safetensors b/model-00007-of-00008.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..3f978019ad0d6a01f90118f83477669677331c6d
--- /dev/null
+++ b/model-00007-of-00008.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1cbbb7cb1dc8ce10f0a99b0d9966c692bdcd6eb8bdacb8d2a45448f06d713e31
+size 4960543792
diff --git a/model-00008-of-00008.safetensors b/model-00008-of-00008.safetensors
new file mode 100644
index 0000000000000000000000000000000000000000..e9bf892e59c6938e3f1eb8af09080422f57a021f
--- /dev/null
+++ b/model-00008-of-00008.safetensors
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:73e043dce6a730889c948add179e59d991c0dd99cf882222c5ad57c25e1e8b3e
+size 532677104
diff --git a/model.safetensors.index.json b/model.safetensors.index.json
new file mode 100644
index 0000000000000000000000000000000000000000..197b878b0e9f9b1ffb97a7c4da1fb834fffcaf97
--- /dev/null
+++ b/model.safetensors.index.json
@@ -0,0 +1,1194 @@
+{
+  "metadata": {
+    "total_size": 35279374848
+  },
+  "weight_map": {
+    "lm_head.weight": "model-00008-of-00008.safetensors",
+    "model.embed_tokens.weight": "model-00001-of-00008.safetensors",
+    "model.layers.0.input_layernorm.weight": "model-00001-of-00008.safetensors",
+    "model.layers.0.mlp.language_mlp.down_proj.weight": "model-00001-of-00008.safetensors",
+    "model.layers.0.mlp.language_mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
+    "model.layers.0.mlp.language_mlp.up_proj.weight": "model-00001-of-00008.safetensors",
+    "model.layers.0.mlp.vision_mlp.down_proj.weight": "model-00001-of-00008.safetensors",
+    "model.layers.0.mlp.vision_mlp.gate_proj.weight": "model-00001-of-00008.safetensors",
"model.layers.0.mlp.vision_mlp.up_proj.weight": "model-00001-of-00008.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00008.safetensors", + "model.layers.0.self_attn.language_expert_dense.weight": "model-00001-of-00008.safetensors", + "model.layers.0.self_attn.language_expert_query_key_value.weight": "model-00001-of-00008.safetensors", + "model.layers.0.self_attn.rotary_emb.inv_freq": "model-00001-of-00008.safetensors", + "model.layers.0.self_attn.vision_expert_dense.weight": "model-00001-of-00008.safetensors", + "model.layers.0.self_attn.vision_expert_query_key_value.weight": "model-00001-of-00008.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00008.safetensors", + "model.layers.1.mlp.language_mlp.down_proj.weight": "model-00001-of-00008.safetensors", + "model.layers.1.mlp.language_mlp.gate_proj.weight": "model-00001-of-00008.safetensors", + "model.layers.1.mlp.language_mlp.up_proj.weight": "model-00001-of-00008.safetensors", + "model.layers.1.mlp.vision_mlp.down_proj.weight": "model-00001-of-00008.safetensors", + "model.layers.1.mlp.vision_mlp.gate_proj.weight": "model-00001-of-00008.safetensors", + "model.layers.1.mlp.vision_mlp.up_proj.weight": "model-00001-of-00008.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00008.safetensors", + "model.layers.1.self_attn.language_expert_dense.weight": "model-00001-of-00008.safetensors", + "model.layers.1.self_attn.language_expert_query_key_value.weight": "model-00001-of-00008.safetensors", + "model.layers.1.self_attn.rotary_emb.inv_freq": "model-00001-of-00008.safetensors", + "model.layers.1.self_attn.vision_expert_dense.weight": "model-00001-of-00008.safetensors", + "model.layers.1.self_attn.vision_expert_query_key_value.weight": "model-00001-of-00008.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00008.safetensors", + "model.layers.10.mlp.language_mlp.down_proj.weight": "model-00002-of-00008.safetensors", + "model.layers.10.mlp.language_mlp.gate_proj.weight": "model-00002-of-00008.safetensors", + "model.layers.10.mlp.language_mlp.up_proj.weight": "model-00002-of-00008.safetensors", + "model.layers.10.mlp.vision_mlp.down_proj.weight": "model-00002-of-00008.safetensors", + "model.layers.10.mlp.vision_mlp.gate_proj.weight": "model-00002-of-00008.safetensors", + "model.layers.10.mlp.vision_mlp.up_proj.weight": "model-00002-of-00008.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00008.safetensors", + "model.layers.10.self_attn.language_expert_dense.weight": "model-00002-of-00008.safetensors", + "model.layers.10.self_attn.language_expert_query_key_value.weight": "model-00002-of-00008.safetensors", + "model.layers.10.self_attn.rotary_emb.inv_freq": "model-00002-of-00008.safetensors", + "model.layers.10.self_attn.vision_expert_dense.weight": "model-00002-of-00008.safetensors", + "model.layers.10.self_attn.vision_expert_query_key_value.weight": "model-00002-of-00008.safetensors", + "model.layers.11.input_layernorm.weight": "model-00003-of-00008.safetensors", + "model.layers.11.mlp.language_mlp.down_proj.weight": "model-00002-of-00008.safetensors", + "model.layers.11.mlp.language_mlp.gate_proj.weight": "model-00002-of-00008.safetensors", + "model.layers.11.mlp.language_mlp.up_proj.weight": "model-00002-of-00008.safetensors", + "model.layers.11.mlp.vision_mlp.down_proj.weight": "model-00003-of-00008.safetensors", + "model.layers.11.mlp.vision_mlp.gate_proj.weight": "model-00002-of-00008.safetensors", 
+ "model.layers.11.mlp.vision_mlp.up_proj.weight": "model-00002-of-00008.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00003-of-00008.safetensors", + "model.layers.11.self_attn.language_expert_dense.weight": "model-00002-of-00008.safetensors", + "model.layers.11.self_attn.language_expert_query_key_value.weight": "model-00002-of-00008.safetensors", + "model.layers.11.self_attn.rotary_emb.inv_freq": "model-00002-of-00008.safetensors", + "model.layers.11.self_attn.vision_expert_dense.weight": "model-00002-of-00008.safetensors", + "model.layers.11.self_attn.vision_expert_query_key_value.weight": "model-00002-of-00008.safetensors", + "model.layers.12.input_layernorm.weight": "model-00003-of-00008.safetensors", + "model.layers.12.mlp.language_mlp.down_proj.weight": "model-00003-of-00008.safetensors", + "model.layers.12.mlp.language_mlp.gate_proj.weight": "model-00003-of-00008.safetensors", + "model.layers.12.mlp.language_mlp.up_proj.weight": "model-00003-of-00008.safetensors", + "model.layers.12.mlp.vision_mlp.down_proj.weight": "model-00003-of-00008.safetensors", + "model.layers.12.mlp.vision_mlp.gate_proj.weight": "model-00003-of-00008.safetensors", + "model.layers.12.mlp.vision_mlp.up_proj.weight": "model-00003-of-00008.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00003-of-00008.safetensors", + "model.layers.12.self_attn.language_expert_dense.weight": "model-00003-of-00008.safetensors", + "model.layers.12.self_attn.language_expert_query_key_value.weight": "model-00003-of-00008.safetensors", + "model.layers.12.self_attn.rotary_emb.inv_freq": "model-00003-of-00008.safetensors", + "model.layers.12.self_attn.vision_expert_dense.weight": "model-00003-of-00008.safetensors", + "model.layers.12.self_attn.vision_expert_query_key_value.weight": "model-00003-of-00008.safetensors", + "model.layers.13.input_layernorm.weight": "model-00003-of-00008.safetensors", + "model.layers.13.mlp.language_mlp.down_proj.weight": "model-00003-of-00008.safetensors", + "model.layers.13.mlp.language_mlp.gate_proj.weight": "model-00003-of-00008.safetensors", + "model.layers.13.mlp.language_mlp.up_proj.weight": "model-00003-of-00008.safetensors", + "model.layers.13.mlp.vision_mlp.down_proj.weight": "model-00003-of-00008.safetensors", + "model.layers.13.mlp.vision_mlp.gate_proj.weight": "model-00003-of-00008.safetensors", + "model.layers.13.mlp.vision_mlp.up_proj.weight": "model-00003-of-00008.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00003-of-00008.safetensors", + "model.layers.13.self_attn.language_expert_dense.weight": "model-00003-of-00008.safetensors", + "model.layers.13.self_attn.language_expert_query_key_value.weight": "model-00003-of-00008.safetensors", + "model.layers.13.self_attn.rotary_emb.inv_freq": "model-00003-of-00008.safetensors", + "model.layers.13.self_attn.vision_expert_dense.weight": "model-00003-of-00008.safetensors", + "model.layers.13.self_attn.vision_expert_query_key_value.weight": "model-00003-of-00008.safetensors", + "model.layers.14.input_layernorm.weight": "model-00003-of-00008.safetensors", + "model.layers.14.mlp.language_mlp.down_proj.weight": "model-00003-of-00008.safetensors", + "model.layers.14.mlp.language_mlp.gate_proj.weight": "model-00003-of-00008.safetensors", + "model.layers.14.mlp.language_mlp.up_proj.weight": "model-00003-of-00008.safetensors", + "model.layers.14.mlp.vision_mlp.down_proj.weight": "model-00003-of-00008.safetensors", + "model.layers.14.mlp.vision_mlp.gate_proj.weight": 
"model-00003-of-00008.safetensors", + "model.layers.14.mlp.vision_mlp.up_proj.weight": "model-00003-of-00008.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00003-of-00008.safetensors", + "model.layers.14.self_attn.language_expert_dense.weight": "model-00003-of-00008.safetensors", + "model.layers.14.self_attn.language_expert_query_key_value.weight": "model-00003-of-00008.safetensors", + "model.layers.14.self_attn.rotary_emb.inv_freq": "model-00003-of-00008.safetensors", + "model.layers.14.self_attn.vision_expert_dense.weight": "model-00003-of-00008.safetensors", + "model.layers.14.self_attn.vision_expert_query_key_value.weight": "model-00003-of-00008.safetensors", + "model.layers.15.input_layernorm.weight": "model-00003-of-00008.safetensors", + "model.layers.15.mlp.language_mlp.down_proj.weight": "model-00003-of-00008.safetensors", + "model.layers.15.mlp.language_mlp.gate_proj.weight": "model-00003-of-00008.safetensors", + "model.layers.15.mlp.language_mlp.up_proj.weight": "model-00003-of-00008.safetensors", + "model.layers.15.mlp.vision_mlp.down_proj.weight": "model-00003-of-00008.safetensors", + "model.layers.15.mlp.vision_mlp.gate_proj.weight": "model-00003-of-00008.safetensors", + "model.layers.15.mlp.vision_mlp.up_proj.weight": "model-00003-of-00008.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00003-of-00008.safetensors", + "model.layers.15.self_attn.language_expert_dense.weight": "model-00003-of-00008.safetensors", + "model.layers.15.self_attn.language_expert_query_key_value.weight": "model-00003-of-00008.safetensors", + "model.layers.15.self_attn.rotary_emb.inv_freq": "model-00003-of-00008.safetensors", + "model.layers.15.self_attn.vision_expert_dense.weight": "model-00003-of-00008.safetensors", + "model.layers.15.self_attn.vision_expert_query_key_value.weight": "model-00003-of-00008.safetensors", + "model.layers.16.input_layernorm.weight": "model-00003-of-00008.safetensors", + "model.layers.16.mlp.language_mlp.down_proj.weight": "model-00003-of-00008.safetensors", + "model.layers.16.mlp.language_mlp.gate_proj.weight": "model-00003-of-00008.safetensors", + "model.layers.16.mlp.language_mlp.up_proj.weight": "model-00003-of-00008.safetensors", + "model.layers.16.mlp.vision_mlp.down_proj.weight": "model-00003-of-00008.safetensors", + "model.layers.16.mlp.vision_mlp.gate_proj.weight": "model-00003-of-00008.safetensors", + "model.layers.16.mlp.vision_mlp.up_proj.weight": "model-00003-of-00008.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00003-of-00008.safetensors", + "model.layers.16.self_attn.language_expert_dense.weight": "model-00003-of-00008.safetensors", + "model.layers.16.self_attn.language_expert_query_key_value.weight": "model-00003-of-00008.safetensors", + "model.layers.16.self_attn.rotary_emb.inv_freq": "model-00003-of-00008.safetensors", + "model.layers.16.self_attn.vision_expert_dense.weight": "model-00003-of-00008.safetensors", + "model.layers.16.self_attn.vision_expert_query_key_value.weight": "model-00003-of-00008.safetensors", + "model.layers.17.input_layernorm.weight": "model-00003-of-00008.safetensors", + "model.layers.17.mlp.language_mlp.down_proj.weight": "model-00003-of-00008.safetensors", + "model.layers.17.mlp.language_mlp.gate_proj.weight": "model-00003-of-00008.safetensors", + "model.layers.17.mlp.language_mlp.up_proj.weight": "model-00003-of-00008.safetensors", + "model.layers.17.mlp.vision_mlp.down_proj.weight": "model-00003-of-00008.safetensors", + 
"model.layers.17.mlp.vision_mlp.gate_proj.weight": "model-00003-of-00008.safetensors", + "model.layers.17.mlp.vision_mlp.up_proj.weight": "model-00003-of-00008.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00003-of-00008.safetensors", + "model.layers.17.self_attn.language_expert_dense.weight": "model-00003-of-00008.safetensors", + "model.layers.17.self_attn.language_expert_query_key_value.weight": "model-00003-of-00008.safetensors", + "model.layers.17.self_attn.rotary_emb.inv_freq": "model-00003-of-00008.safetensors", + "model.layers.17.self_attn.vision_expert_dense.weight": "model-00003-of-00008.safetensors", + "model.layers.17.self_attn.vision_expert_query_key_value.weight": "model-00003-of-00008.safetensors", + "model.layers.18.input_layernorm.weight": "model-00004-of-00008.safetensors", + "model.layers.18.mlp.language_mlp.down_proj.weight": "model-00004-of-00008.safetensors", + "model.layers.18.mlp.language_mlp.gate_proj.weight": "model-00004-of-00008.safetensors", + "model.layers.18.mlp.language_mlp.up_proj.weight": "model-00004-of-00008.safetensors", + "model.layers.18.mlp.vision_mlp.down_proj.weight": "model-00004-of-00008.safetensors", + "model.layers.18.mlp.vision_mlp.gate_proj.weight": "model-00004-of-00008.safetensors", + "model.layers.18.mlp.vision_mlp.up_proj.weight": "model-00004-of-00008.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00004-of-00008.safetensors", + "model.layers.18.self_attn.language_expert_dense.weight": "model-00004-of-00008.safetensors", + "model.layers.18.self_attn.language_expert_query_key_value.weight": "model-00004-of-00008.safetensors", + "model.layers.18.self_attn.rotary_emb.inv_freq": "model-00003-of-00008.safetensors", + "model.layers.18.self_attn.vision_expert_dense.weight": "model-00004-of-00008.safetensors", + "model.layers.18.self_attn.vision_expert_query_key_value.weight": "model-00004-of-00008.safetensors", + "model.layers.19.input_layernorm.weight": "model-00004-of-00008.safetensors", + "model.layers.19.mlp.language_mlp.down_proj.weight": "model-00004-of-00008.safetensors", + "model.layers.19.mlp.language_mlp.gate_proj.weight": "model-00004-of-00008.safetensors", + "model.layers.19.mlp.language_mlp.up_proj.weight": "model-00004-of-00008.safetensors", + "model.layers.19.mlp.vision_mlp.down_proj.weight": "model-00004-of-00008.safetensors", + "model.layers.19.mlp.vision_mlp.gate_proj.weight": "model-00004-of-00008.safetensors", + "model.layers.19.mlp.vision_mlp.up_proj.weight": "model-00004-of-00008.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00004-of-00008.safetensors", + "model.layers.19.self_attn.language_expert_dense.weight": "model-00004-of-00008.safetensors", + "model.layers.19.self_attn.language_expert_query_key_value.weight": "model-00004-of-00008.safetensors", + "model.layers.19.self_attn.rotary_emb.inv_freq": "model-00004-of-00008.safetensors", + "model.layers.19.self_attn.vision_expert_dense.weight": "model-00004-of-00008.safetensors", + "model.layers.19.self_attn.vision_expert_query_key_value.weight": "model-00004-of-00008.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00008.safetensors", + "model.layers.2.mlp.language_mlp.down_proj.weight": "model-00001-of-00008.safetensors", + "model.layers.2.mlp.language_mlp.gate_proj.weight": "model-00001-of-00008.safetensors", + "model.layers.2.mlp.language_mlp.up_proj.weight": "model-00001-of-00008.safetensors", + "model.layers.2.mlp.vision_mlp.down_proj.weight": 
"model-00001-of-00008.safetensors", + "model.layers.2.mlp.vision_mlp.gate_proj.weight": "model-00001-of-00008.safetensors", + "model.layers.2.mlp.vision_mlp.up_proj.weight": "model-00001-of-00008.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00008.safetensors", + "model.layers.2.self_attn.language_expert_dense.weight": "model-00001-of-00008.safetensors", + "model.layers.2.self_attn.language_expert_query_key_value.weight": "model-00001-of-00008.safetensors", + "model.layers.2.self_attn.rotary_emb.inv_freq": "model-00001-of-00008.safetensors", + "model.layers.2.self_attn.vision_expert_dense.weight": "model-00001-of-00008.safetensors", + "model.layers.2.self_attn.vision_expert_query_key_value.weight": "model-00001-of-00008.safetensors", + "model.layers.20.input_layernorm.weight": "model-00004-of-00008.safetensors", + "model.layers.20.mlp.language_mlp.down_proj.weight": "model-00004-of-00008.safetensors", + "model.layers.20.mlp.language_mlp.gate_proj.weight": "model-00004-of-00008.safetensors", + "model.layers.20.mlp.language_mlp.up_proj.weight": "model-00004-of-00008.safetensors", + "model.layers.20.mlp.vision_mlp.down_proj.weight": "model-00004-of-00008.safetensors", + "model.layers.20.mlp.vision_mlp.gate_proj.weight": "model-00004-of-00008.safetensors", + "model.layers.20.mlp.vision_mlp.up_proj.weight": "model-00004-of-00008.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00004-of-00008.safetensors", + "model.layers.20.self_attn.language_expert_dense.weight": "model-00004-of-00008.safetensors", + "model.layers.20.self_attn.language_expert_query_key_value.weight": "model-00004-of-00008.safetensors", + "model.layers.20.self_attn.rotary_emb.inv_freq": "model-00004-of-00008.safetensors", + "model.layers.20.self_attn.vision_expert_dense.weight": "model-00004-of-00008.safetensors", + "model.layers.20.self_attn.vision_expert_query_key_value.weight": "model-00004-of-00008.safetensors", + "model.layers.21.input_layernorm.weight": "model-00004-of-00008.safetensors", + "model.layers.21.mlp.language_mlp.down_proj.weight": "model-00004-of-00008.safetensors", + "model.layers.21.mlp.language_mlp.gate_proj.weight": "model-00004-of-00008.safetensors", + "model.layers.21.mlp.language_mlp.up_proj.weight": "model-00004-of-00008.safetensors", + "model.layers.21.mlp.vision_mlp.down_proj.weight": "model-00004-of-00008.safetensors", + "model.layers.21.mlp.vision_mlp.gate_proj.weight": "model-00004-of-00008.safetensors", + "model.layers.21.mlp.vision_mlp.up_proj.weight": "model-00004-of-00008.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00004-of-00008.safetensors", + "model.layers.21.self_attn.language_expert_dense.weight": "model-00004-of-00008.safetensors", + "model.layers.21.self_attn.language_expert_query_key_value.weight": "model-00004-of-00008.safetensors", + "model.layers.21.self_attn.rotary_emb.inv_freq": "model-00004-of-00008.safetensors", + "model.layers.21.self_attn.vision_expert_dense.weight": "model-00004-of-00008.safetensors", + "model.layers.21.self_attn.vision_expert_query_key_value.weight": "model-00004-of-00008.safetensors", + "model.layers.22.input_layernorm.weight": "model-00004-of-00008.safetensors", + "model.layers.22.mlp.language_mlp.down_proj.weight": "model-00004-of-00008.safetensors", + "model.layers.22.mlp.language_mlp.gate_proj.weight": "model-00004-of-00008.safetensors", + "model.layers.22.mlp.language_mlp.up_proj.weight": "model-00004-of-00008.safetensors", + 
"model.layers.22.mlp.vision_mlp.down_proj.weight": "model-00004-of-00008.safetensors", + "model.layers.22.mlp.vision_mlp.gate_proj.weight": "model-00004-of-00008.safetensors", + "model.layers.22.mlp.vision_mlp.up_proj.weight": "model-00004-of-00008.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00004-of-00008.safetensors", + "model.layers.22.self_attn.language_expert_dense.weight": "model-00004-of-00008.safetensors", + "model.layers.22.self_attn.language_expert_query_key_value.weight": "model-00004-of-00008.safetensors", + "model.layers.22.self_attn.rotary_emb.inv_freq": "model-00004-of-00008.safetensors", + "model.layers.22.self_attn.vision_expert_dense.weight": "model-00004-of-00008.safetensors", + "model.layers.22.self_attn.vision_expert_query_key_value.weight": "model-00004-of-00008.safetensors", + "model.layers.23.input_layernorm.weight": "model-00004-of-00008.safetensors", + "model.layers.23.mlp.language_mlp.down_proj.weight": "model-00004-of-00008.safetensors", + "model.layers.23.mlp.language_mlp.gate_proj.weight": "model-00004-of-00008.safetensors", + "model.layers.23.mlp.language_mlp.up_proj.weight": "model-00004-of-00008.safetensors", + "model.layers.23.mlp.vision_mlp.down_proj.weight": "model-00004-of-00008.safetensors", + "model.layers.23.mlp.vision_mlp.gate_proj.weight": "model-00004-of-00008.safetensors", + "model.layers.23.mlp.vision_mlp.up_proj.weight": "model-00004-of-00008.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00004-of-00008.safetensors", + "model.layers.23.self_attn.language_expert_dense.weight": "model-00004-of-00008.safetensors", + "model.layers.23.self_attn.language_expert_query_key_value.weight": "model-00004-of-00008.safetensors", + "model.layers.23.self_attn.rotary_emb.inv_freq": "model-00004-of-00008.safetensors", + "model.layers.23.self_attn.vision_expert_dense.weight": "model-00004-of-00008.safetensors", + "model.layers.23.self_attn.vision_expert_query_key_value.weight": "model-00004-of-00008.safetensors", + "model.layers.24.input_layernorm.weight": "model-00005-of-00008.safetensors", + "model.layers.24.mlp.language_mlp.down_proj.weight": "model-00005-of-00008.safetensors", + "model.layers.24.mlp.language_mlp.gate_proj.weight": "model-00005-of-00008.safetensors", + "model.layers.24.mlp.language_mlp.up_proj.weight": "model-00005-of-00008.safetensors", + "model.layers.24.mlp.vision_mlp.down_proj.weight": "model-00005-of-00008.safetensors", + "model.layers.24.mlp.vision_mlp.gate_proj.weight": "model-00005-of-00008.safetensors", + "model.layers.24.mlp.vision_mlp.up_proj.weight": "model-00005-of-00008.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00005-of-00008.safetensors", + "model.layers.24.self_attn.language_expert_dense.weight": "model-00005-of-00008.safetensors", + "model.layers.24.self_attn.language_expert_query_key_value.weight": "model-00005-of-00008.safetensors", + "model.layers.24.self_attn.rotary_emb.inv_freq": "model-00004-of-00008.safetensors", + "model.layers.24.self_attn.vision_expert_dense.weight": "model-00004-of-00008.safetensors", + "model.layers.24.self_attn.vision_expert_query_key_value.weight": "model-00004-of-00008.safetensors", + "model.layers.25.input_layernorm.weight": "model-00005-of-00008.safetensors", + "model.layers.25.mlp.language_mlp.down_proj.weight": "model-00005-of-00008.safetensors", + "model.layers.25.mlp.language_mlp.gate_proj.weight": "model-00005-of-00008.safetensors", + "model.layers.25.mlp.language_mlp.up_proj.weight": 
"model-00005-of-00008.safetensors", + "model.layers.25.mlp.vision_mlp.down_proj.weight": "model-00005-of-00008.safetensors", + "model.layers.25.mlp.vision_mlp.gate_proj.weight": "model-00005-of-00008.safetensors", + "model.layers.25.mlp.vision_mlp.up_proj.weight": "model-00005-of-00008.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00005-of-00008.safetensors", + "model.layers.25.self_attn.language_expert_dense.weight": "model-00005-of-00008.safetensors", + "model.layers.25.self_attn.language_expert_query_key_value.weight": "model-00005-of-00008.safetensors", + "model.layers.25.self_attn.rotary_emb.inv_freq": "model-00005-of-00008.safetensors", + "model.layers.25.self_attn.vision_expert_dense.weight": "model-00005-of-00008.safetensors", + "model.layers.25.self_attn.vision_expert_query_key_value.weight": "model-00005-of-00008.safetensors", + "model.layers.26.input_layernorm.weight": "model-00005-of-00008.safetensors", + "model.layers.26.mlp.language_mlp.down_proj.weight": "model-00005-of-00008.safetensors", + "model.layers.26.mlp.language_mlp.gate_proj.weight": "model-00005-of-00008.safetensors", + "model.layers.26.mlp.language_mlp.up_proj.weight": "model-00005-of-00008.safetensors", + "model.layers.26.mlp.vision_mlp.down_proj.weight": "model-00005-of-00008.safetensors", + "model.layers.26.mlp.vision_mlp.gate_proj.weight": "model-00005-of-00008.safetensors", + "model.layers.26.mlp.vision_mlp.up_proj.weight": "model-00005-of-00008.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00005-of-00008.safetensors", + "model.layers.26.self_attn.language_expert_dense.weight": "model-00005-of-00008.safetensors", + "model.layers.26.self_attn.language_expert_query_key_value.weight": "model-00005-of-00008.safetensors", + "model.layers.26.self_attn.rotary_emb.inv_freq": "model-00005-of-00008.safetensors", + "model.layers.26.self_attn.vision_expert_dense.weight": "model-00005-of-00008.safetensors", + "model.layers.26.self_attn.vision_expert_query_key_value.weight": "model-00005-of-00008.safetensors", + "model.layers.27.input_layernorm.weight": "model-00005-of-00008.safetensors", + "model.layers.27.mlp.language_mlp.down_proj.weight": "model-00005-of-00008.safetensors", + "model.layers.27.mlp.language_mlp.gate_proj.weight": "model-00005-of-00008.safetensors", + "model.layers.27.mlp.language_mlp.up_proj.weight": "model-00005-of-00008.safetensors", + "model.layers.27.mlp.vision_mlp.down_proj.weight": "model-00005-of-00008.safetensors", + "model.layers.27.mlp.vision_mlp.gate_proj.weight": "model-00005-of-00008.safetensors", + "model.layers.27.mlp.vision_mlp.up_proj.weight": "model-00005-of-00008.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00005-of-00008.safetensors", + "model.layers.27.self_attn.language_expert_dense.weight": "model-00005-of-00008.safetensors", + "model.layers.27.self_attn.language_expert_query_key_value.weight": "model-00005-of-00008.safetensors", + "model.layers.27.self_attn.rotary_emb.inv_freq": "model-00005-of-00008.safetensors", + "model.layers.27.self_attn.vision_expert_dense.weight": "model-00005-of-00008.safetensors", + "model.layers.27.self_attn.vision_expert_query_key_value.weight": "model-00005-of-00008.safetensors", + "model.layers.28.input_layernorm.weight": "model-00005-of-00008.safetensors", + "model.layers.28.mlp.language_mlp.down_proj.weight": "model-00005-of-00008.safetensors", + "model.layers.28.mlp.language_mlp.gate_proj.weight": "model-00005-of-00008.safetensors", + 
"model.layers.28.mlp.language_mlp.up_proj.weight": "model-00005-of-00008.safetensors", + "model.layers.28.mlp.vision_mlp.down_proj.weight": "model-00005-of-00008.safetensors", + "model.layers.28.mlp.vision_mlp.gate_proj.weight": "model-00005-of-00008.safetensors", + "model.layers.28.mlp.vision_mlp.up_proj.weight": "model-00005-of-00008.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00005-of-00008.safetensors", + "model.layers.28.self_attn.language_expert_dense.weight": "model-00005-of-00008.safetensors", + "model.layers.28.self_attn.language_expert_query_key_value.weight": "model-00005-of-00008.safetensors", + "model.layers.28.self_attn.rotary_emb.inv_freq": "model-00005-of-00008.safetensors", + "model.layers.28.self_attn.vision_expert_dense.weight": "model-00005-of-00008.safetensors", + "model.layers.28.self_attn.vision_expert_query_key_value.weight": "model-00005-of-00008.safetensors", + "model.layers.29.input_layernorm.weight": "model-00005-of-00008.safetensors", + "model.layers.29.mlp.language_mlp.down_proj.weight": "model-00005-of-00008.safetensors", + "model.layers.29.mlp.language_mlp.gate_proj.weight": "model-00005-of-00008.safetensors", + "model.layers.29.mlp.language_mlp.up_proj.weight": "model-00005-of-00008.safetensors", + "model.layers.29.mlp.vision_mlp.down_proj.weight": "model-00005-of-00008.safetensors", + "model.layers.29.mlp.vision_mlp.gate_proj.weight": "model-00005-of-00008.safetensors", + "model.layers.29.mlp.vision_mlp.up_proj.weight": "model-00005-of-00008.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00005-of-00008.safetensors", + "model.layers.29.self_attn.language_expert_dense.weight": "model-00005-of-00008.safetensors", + "model.layers.29.self_attn.language_expert_query_key_value.weight": "model-00005-of-00008.safetensors", + "model.layers.29.self_attn.rotary_emb.inv_freq": "model-00005-of-00008.safetensors", + "model.layers.29.self_attn.vision_expert_dense.weight": "model-00005-of-00008.safetensors", + "model.layers.29.self_attn.vision_expert_query_key_value.weight": "model-00005-of-00008.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00008.safetensors", + "model.layers.3.mlp.language_mlp.down_proj.weight": "model-00001-of-00008.safetensors", + "model.layers.3.mlp.language_mlp.gate_proj.weight": "model-00001-of-00008.safetensors", + "model.layers.3.mlp.language_mlp.up_proj.weight": "model-00001-of-00008.safetensors", + "model.layers.3.mlp.vision_mlp.down_proj.weight": "model-00001-of-00008.safetensors", + "model.layers.3.mlp.vision_mlp.gate_proj.weight": "model-00001-of-00008.safetensors", + "model.layers.3.mlp.vision_mlp.up_proj.weight": "model-00001-of-00008.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00008.safetensors", + "model.layers.3.self_attn.language_expert_dense.weight": "model-00001-of-00008.safetensors", + "model.layers.3.self_attn.language_expert_query_key_value.weight": "model-00001-of-00008.safetensors", + "model.layers.3.self_attn.rotary_emb.inv_freq": "model-00001-of-00008.safetensors", + "model.layers.3.self_attn.vision_expert_dense.weight": "model-00001-of-00008.safetensors", + "model.layers.3.self_attn.vision_expert_query_key_value.weight": "model-00001-of-00008.safetensors", + "model.layers.30.input_layernorm.weight": "model-00006-of-00008.safetensors", + "model.layers.30.mlp.language_mlp.down_proj.weight": "model-00006-of-00008.safetensors", + "model.layers.30.mlp.language_mlp.gate_proj.weight": 
"model-00006-of-00008.safetensors", + "model.layers.30.mlp.language_mlp.up_proj.weight": "model-00006-of-00008.safetensors", + "model.layers.30.mlp.vision_mlp.down_proj.weight": "model-00006-of-00008.safetensors", + "model.layers.30.mlp.vision_mlp.gate_proj.weight": "model-00006-of-00008.safetensors", + "model.layers.30.mlp.vision_mlp.up_proj.weight": "model-00006-of-00008.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00006-of-00008.safetensors", + "model.layers.30.self_attn.language_expert_dense.weight": "model-00005-of-00008.safetensors", + "model.layers.30.self_attn.language_expert_query_key_value.weight": "model-00005-of-00008.safetensors", + "model.layers.30.self_attn.rotary_emb.inv_freq": "model-00005-of-00008.safetensors", + "model.layers.30.self_attn.vision_expert_dense.weight": "model-00005-of-00008.safetensors", + "model.layers.30.self_attn.vision_expert_query_key_value.weight": "model-00005-of-00008.safetensors", + "model.layers.31.input_layernorm.weight": "model-00006-of-00008.safetensors", + "model.layers.31.mlp.language_mlp.down_proj.weight": "model-00006-of-00008.safetensors", + "model.layers.31.mlp.language_mlp.gate_proj.weight": "model-00006-of-00008.safetensors", + "model.layers.31.mlp.language_mlp.up_proj.weight": "model-00006-of-00008.safetensors", + "model.layers.31.mlp.vision_mlp.down_proj.weight": "model-00006-of-00008.safetensors", + "model.layers.31.mlp.vision_mlp.gate_proj.weight": "model-00006-of-00008.safetensors", + "model.layers.31.mlp.vision_mlp.up_proj.weight": "model-00006-of-00008.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00006-of-00008.safetensors", + "model.layers.31.self_attn.language_expert_dense.weight": "model-00006-of-00008.safetensors", + "model.layers.31.self_attn.language_expert_query_key_value.weight": "model-00006-of-00008.safetensors", + "model.layers.31.self_attn.rotary_emb.inv_freq": "model-00006-of-00008.safetensors", + "model.layers.31.self_attn.vision_expert_dense.weight": "model-00006-of-00008.safetensors", + "model.layers.31.self_attn.vision_expert_query_key_value.weight": "model-00006-of-00008.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00008.safetensors", + "model.layers.4.mlp.language_mlp.down_proj.weight": "model-00001-of-00008.safetensors", + "model.layers.4.mlp.language_mlp.gate_proj.weight": "model-00001-of-00008.safetensors", + "model.layers.4.mlp.language_mlp.up_proj.weight": "model-00001-of-00008.safetensors", + "model.layers.4.mlp.vision_mlp.down_proj.weight": "model-00001-of-00008.safetensors", + "model.layers.4.mlp.vision_mlp.gate_proj.weight": "model-00001-of-00008.safetensors", + "model.layers.4.mlp.vision_mlp.up_proj.weight": "model-00001-of-00008.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00008.safetensors", + "model.layers.4.self_attn.language_expert_dense.weight": "model-00001-of-00008.safetensors", + "model.layers.4.self_attn.language_expert_query_key_value.weight": "model-00001-of-00008.safetensors", + "model.layers.4.self_attn.rotary_emb.inv_freq": "model-00001-of-00008.safetensors", + "model.layers.4.self_attn.vision_expert_dense.weight": "model-00001-of-00008.safetensors", + "model.layers.4.self_attn.vision_expert_query_key_value.weight": "model-00001-of-00008.safetensors", + "model.layers.5.input_layernorm.weight": "model-00002-of-00008.safetensors", + "model.layers.5.mlp.language_mlp.down_proj.weight": "model-00001-of-00008.safetensors", + 
"model.layers.5.mlp.language_mlp.gate_proj.weight": "model-00001-of-00008.safetensors", + "model.layers.5.mlp.language_mlp.up_proj.weight": "model-00001-of-00008.safetensors", + "model.layers.5.mlp.vision_mlp.down_proj.weight": "model-00002-of-00008.safetensors", + "model.layers.5.mlp.vision_mlp.gate_proj.weight": "model-00001-of-00008.safetensors", + "model.layers.5.mlp.vision_mlp.up_proj.weight": "model-00002-of-00008.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00002-of-00008.safetensors", + "model.layers.5.self_attn.language_expert_dense.weight": "model-00001-of-00008.safetensors", + "model.layers.5.self_attn.language_expert_query_key_value.weight": "model-00001-of-00008.safetensors", + "model.layers.5.self_attn.rotary_emb.inv_freq": "model-00001-of-00008.safetensors", + "model.layers.5.self_attn.vision_expert_dense.weight": "model-00001-of-00008.safetensors", + "model.layers.5.self_attn.vision_expert_query_key_value.weight": "model-00001-of-00008.safetensors", + "model.layers.6.input_layernorm.weight": "model-00002-of-00008.safetensors", + "model.layers.6.mlp.language_mlp.down_proj.weight": "model-00002-of-00008.safetensors", + "model.layers.6.mlp.language_mlp.gate_proj.weight": "model-00002-of-00008.safetensors", + "model.layers.6.mlp.language_mlp.up_proj.weight": "model-00002-of-00008.safetensors", + "model.layers.6.mlp.vision_mlp.down_proj.weight": "model-00002-of-00008.safetensors", + "model.layers.6.mlp.vision_mlp.gate_proj.weight": "model-00002-of-00008.safetensors", + "model.layers.6.mlp.vision_mlp.up_proj.weight": "model-00002-of-00008.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00002-of-00008.safetensors", + "model.layers.6.self_attn.language_expert_dense.weight": "model-00002-of-00008.safetensors", + "model.layers.6.self_attn.language_expert_query_key_value.weight": "model-00002-of-00008.safetensors", + "model.layers.6.self_attn.rotary_emb.inv_freq": "model-00002-of-00008.safetensors", + "model.layers.6.self_attn.vision_expert_dense.weight": "model-00002-of-00008.safetensors", + "model.layers.6.self_attn.vision_expert_query_key_value.weight": "model-00002-of-00008.safetensors", + "model.layers.7.input_layernorm.weight": "model-00002-of-00008.safetensors", + "model.layers.7.mlp.language_mlp.down_proj.weight": "model-00002-of-00008.safetensors", + "model.layers.7.mlp.language_mlp.gate_proj.weight": "model-00002-of-00008.safetensors", + "model.layers.7.mlp.language_mlp.up_proj.weight": "model-00002-of-00008.safetensors", + "model.layers.7.mlp.vision_mlp.down_proj.weight": "model-00002-of-00008.safetensors", + "model.layers.7.mlp.vision_mlp.gate_proj.weight": "model-00002-of-00008.safetensors", + "model.layers.7.mlp.vision_mlp.up_proj.weight": "model-00002-of-00008.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00002-of-00008.safetensors", + "model.layers.7.self_attn.language_expert_dense.weight": "model-00002-of-00008.safetensors", + "model.layers.7.self_attn.language_expert_query_key_value.weight": "model-00002-of-00008.safetensors", + "model.layers.7.self_attn.rotary_emb.inv_freq": "model-00002-of-00008.safetensors", + "model.layers.7.self_attn.vision_expert_dense.weight": "model-00002-of-00008.safetensors", + "model.layers.7.self_attn.vision_expert_query_key_value.weight": "model-00002-of-00008.safetensors", + "model.layers.8.input_layernorm.weight": "model-00002-of-00008.safetensors", + "model.layers.8.mlp.language_mlp.down_proj.weight": "model-00002-of-00008.safetensors", + 
"model.layers.8.mlp.language_mlp.gate_proj.weight": "model-00002-of-00008.safetensors", + "model.layers.8.mlp.language_mlp.up_proj.weight": "model-00002-of-00008.safetensors", + "model.layers.8.mlp.vision_mlp.down_proj.weight": "model-00002-of-00008.safetensors", + "model.layers.8.mlp.vision_mlp.gate_proj.weight": "model-00002-of-00008.safetensors", + "model.layers.8.mlp.vision_mlp.up_proj.weight": "model-00002-of-00008.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00002-of-00008.safetensors", + "model.layers.8.self_attn.language_expert_dense.weight": "model-00002-of-00008.safetensors", + "model.layers.8.self_attn.language_expert_query_key_value.weight": "model-00002-of-00008.safetensors", + "model.layers.8.self_attn.rotary_emb.inv_freq": "model-00002-of-00008.safetensors", + "model.layers.8.self_attn.vision_expert_dense.weight": "model-00002-of-00008.safetensors", + "model.layers.8.self_attn.vision_expert_query_key_value.weight": "model-00002-of-00008.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00008.safetensors", + "model.layers.9.mlp.language_mlp.down_proj.weight": "model-00002-of-00008.safetensors", + "model.layers.9.mlp.language_mlp.gate_proj.weight": "model-00002-of-00008.safetensors", + "model.layers.9.mlp.language_mlp.up_proj.weight": "model-00002-of-00008.safetensors", + "model.layers.9.mlp.vision_mlp.down_proj.weight": "model-00002-of-00008.safetensors", + "model.layers.9.mlp.vision_mlp.gate_proj.weight": "model-00002-of-00008.safetensors", + "model.layers.9.mlp.vision_mlp.up_proj.weight": "model-00002-of-00008.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00008.safetensors", + "model.layers.9.self_attn.language_expert_dense.weight": "model-00002-of-00008.safetensors", + "model.layers.9.self_attn.language_expert_query_key_value.weight": "model-00002-of-00008.safetensors", + "model.layers.9.self_attn.rotary_emb.inv_freq": "model-00002-of-00008.safetensors", + "model.layers.9.self_attn.vision_expert_dense.weight": "model-00002-of-00008.safetensors", + "model.layers.9.self_attn.vision_expert_query_key_value.weight": "model-00002-of-00008.safetensors", + "model.norm.weight": "model-00006-of-00008.safetensors", + "model.vision.boi": "model-00006-of-00008.safetensors", + "model.vision.eoi": "model-00006-of-00008.safetensors", + "model.vision.linear_proj.dense_4h_to_h.weight": "model-00008-of-00008.safetensors", + "model.vision.linear_proj.dense_h_to_4h.weight": "model-00008-of-00008.safetensors", + "model.vision.linear_proj.gate_proj.weight": "model-00008-of-00008.safetensors", + "model.vision.linear_proj.linear_proj.weight": "model-00007-of-00008.safetensors", + "model.vision.linear_proj.norm1.bias": "model-00007-of-00008.safetensors", + "model.vision.linear_proj.norm1.weight": "model-00007-of-00008.safetensors", + "model.vision.patch_embedding.cls_embedding": "model-00006-of-00008.safetensors", + "model.vision.patch_embedding.position_embedding.weight": "model-00006-of-00008.safetensors", + "model.vision.patch_embedding.proj.bias": "model-00006-of-00008.safetensors", + "model.vision.patch_embedding.proj.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.0.attention.dense.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.0.attention.dense.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.0.attention.query_key_value.bias": "model-00006-of-00008.safetensors", + 
"model.vision.transformer.layers.0.attention.query_key_value.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.0.input_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.0.input_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.0.mlp.fc1.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.0.mlp.fc1.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.0.mlp.fc2.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.0.mlp.fc2.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.0.post_attention_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.0.post_attention_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.1.attention.dense.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.1.attention.dense.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.1.attention.query_key_value.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.1.attention.query_key_value.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.1.input_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.1.input_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.1.mlp.fc1.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.1.mlp.fc1.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.1.mlp.fc2.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.1.mlp.fc2.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.1.post_attention_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.1.post_attention_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.10.attention.dense.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.10.attention.dense.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.10.attention.query_key_value.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.10.attention.query_key_value.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.10.input_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.10.input_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.10.mlp.fc1.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.10.mlp.fc1.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.10.mlp.fc2.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.10.mlp.fc2.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.10.post_attention_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.10.post_attention_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.11.attention.dense.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.11.attention.dense.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.11.attention.query_key_value.bias": "model-00006-of-00008.safetensors", + 
"model.vision.transformer.layers.11.attention.query_key_value.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.11.input_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.11.input_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.11.mlp.fc1.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.11.mlp.fc1.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.11.mlp.fc2.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.11.mlp.fc2.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.11.post_attention_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.11.post_attention_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.12.attention.dense.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.12.attention.dense.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.12.attention.query_key_value.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.12.attention.query_key_value.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.12.input_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.12.input_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.12.mlp.fc1.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.12.mlp.fc1.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.12.mlp.fc2.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.12.mlp.fc2.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.12.post_attention_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.12.post_attention_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.13.attention.dense.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.13.attention.dense.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.13.attention.query_key_value.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.13.attention.query_key_value.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.13.input_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.13.input_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.13.mlp.fc1.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.13.mlp.fc1.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.13.mlp.fc2.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.13.mlp.fc2.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.13.post_attention_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.13.post_attention_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.14.attention.dense.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.14.attention.dense.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.14.attention.query_key_value.bias": "model-00006-of-00008.safetensors", + 
"model.vision.transformer.layers.14.attention.query_key_value.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.14.input_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.14.input_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.14.mlp.fc1.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.14.mlp.fc1.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.14.mlp.fc2.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.14.mlp.fc2.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.14.post_attention_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.14.post_attention_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.15.attention.dense.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.15.attention.dense.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.15.attention.query_key_value.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.15.attention.query_key_value.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.15.input_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.15.input_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.15.mlp.fc1.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.15.mlp.fc1.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.15.mlp.fc2.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.15.mlp.fc2.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.15.post_attention_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.15.post_attention_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.16.attention.dense.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.16.attention.dense.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.16.attention.query_key_value.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.16.attention.query_key_value.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.16.input_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.16.input_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.16.mlp.fc1.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.16.mlp.fc1.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.16.mlp.fc2.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.16.mlp.fc2.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.16.post_attention_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.16.post_attention_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.17.attention.dense.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.17.attention.dense.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.17.attention.query_key_value.bias": "model-00006-of-00008.safetensors", + 
"model.vision.transformer.layers.17.attention.query_key_value.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.17.input_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.17.input_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.17.mlp.fc1.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.17.mlp.fc1.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.17.mlp.fc2.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.17.mlp.fc2.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.17.post_attention_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.17.post_attention_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.18.attention.dense.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.18.attention.dense.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.18.attention.query_key_value.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.18.attention.query_key_value.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.18.input_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.18.input_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.18.mlp.fc1.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.18.mlp.fc1.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.18.mlp.fc2.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.18.mlp.fc2.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.18.post_attention_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.18.post_attention_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.19.attention.dense.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.19.attention.dense.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.19.attention.query_key_value.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.19.attention.query_key_value.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.19.input_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.19.input_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.19.mlp.fc1.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.19.mlp.fc1.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.19.mlp.fc2.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.19.mlp.fc2.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.19.post_attention_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.19.post_attention_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.2.attention.dense.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.2.attention.dense.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.2.attention.query_key_value.bias": "model-00006-of-00008.safetensors", + 
"model.vision.transformer.layers.2.attention.query_key_value.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.2.input_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.2.input_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.2.mlp.fc1.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.2.mlp.fc1.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.2.mlp.fc2.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.2.mlp.fc2.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.2.post_attention_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.2.post_attention_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.20.attention.dense.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.20.attention.dense.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.20.attention.query_key_value.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.20.attention.query_key_value.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.20.input_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.20.input_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.20.mlp.fc1.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.20.mlp.fc1.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.20.mlp.fc2.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.20.mlp.fc2.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.20.post_attention_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.20.post_attention_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.21.attention.dense.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.21.attention.dense.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.21.attention.query_key_value.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.21.attention.query_key_value.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.21.input_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.21.input_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.21.mlp.fc1.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.21.mlp.fc1.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.21.mlp.fc2.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.21.mlp.fc2.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.21.post_attention_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.21.post_attention_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.22.attention.dense.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.22.attention.dense.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.22.attention.query_key_value.bias": "model-00006-of-00008.safetensors", + 
"model.vision.transformer.layers.22.attention.query_key_value.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.22.input_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.22.input_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.22.mlp.fc1.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.22.mlp.fc1.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.22.mlp.fc2.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.22.mlp.fc2.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.22.post_attention_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.22.post_attention_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.23.attention.dense.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.23.attention.dense.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.23.attention.query_key_value.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.23.attention.query_key_value.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.23.input_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.23.input_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.23.mlp.fc1.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.23.mlp.fc1.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.23.mlp.fc2.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.23.mlp.fc2.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.23.post_attention_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.23.post_attention_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.24.attention.dense.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.24.attention.dense.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.24.attention.query_key_value.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.24.attention.query_key_value.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.24.input_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.24.input_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.24.mlp.fc1.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.24.mlp.fc1.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.24.mlp.fc2.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.24.mlp.fc2.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.24.post_attention_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.24.post_attention_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.25.attention.dense.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.25.attention.dense.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.25.attention.query_key_value.bias": "model-00006-of-00008.safetensors", + 
"model.vision.transformer.layers.25.attention.query_key_value.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.25.input_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.25.input_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.25.mlp.fc1.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.25.mlp.fc1.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.25.mlp.fc2.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.25.mlp.fc2.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.25.post_attention_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.25.post_attention_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.26.attention.dense.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.26.attention.dense.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.26.attention.query_key_value.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.26.attention.query_key_value.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.26.input_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.26.input_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.26.mlp.fc1.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.26.mlp.fc1.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.26.mlp.fc2.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.26.mlp.fc2.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.26.post_attention_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.26.post_attention_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.27.attention.dense.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.27.attention.dense.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.27.attention.query_key_value.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.27.attention.query_key_value.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.27.input_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.27.input_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.27.mlp.fc1.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.27.mlp.fc1.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.27.mlp.fc2.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.27.mlp.fc2.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.27.post_attention_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.27.post_attention_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.28.attention.dense.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.28.attention.dense.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.28.attention.query_key_value.bias": "model-00007-of-00008.safetensors", + 
"model.vision.transformer.layers.28.attention.query_key_value.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.28.input_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.28.input_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.28.mlp.fc1.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.28.mlp.fc1.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.28.mlp.fc2.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.28.mlp.fc2.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.28.post_attention_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.28.post_attention_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.29.attention.dense.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.29.attention.dense.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.29.attention.query_key_value.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.29.attention.query_key_value.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.29.input_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.29.input_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.29.mlp.fc1.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.29.mlp.fc1.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.29.mlp.fc2.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.29.mlp.fc2.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.29.post_attention_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.29.post_attention_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.3.attention.dense.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.3.attention.dense.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.3.attention.query_key_value.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.3.attention.query_key_value.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.3.input_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.3.input_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.3.mlp.fc1.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.3.mlp.fc1.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.3.mlp.fc2.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.3.mlp.fc2.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.3.post_attention_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.3.post_attention_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.30.attention.dense.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.30.attention.dense.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.30.attention.query_key_value.bias": "model-00007-of-00008.safetensors", + 
"model.vision.transformer.layers.30.attention.query_key_value.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.30.input_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.30.input_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.30.mlp.fc1.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.30.mlp.fc1.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.30.mlp.fc2.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.30.mlp.fc2.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.30.post_attention_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.30.post_attention_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.31.attention.dense.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.31.attention.dense.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.31.attention.query_key_value.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.31.attention.query_key_value.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.31.input_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.31.input_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.31.mlp.fc1.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.31.mlp.fc1.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.31.mlp.fc2.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.31.mlp.fc2.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.31.post_attention_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.31.post_attention_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.32.attention.dense.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.32.attention.dense.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.32.attention.query_key_value.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.32.attention.query_key_value.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.32.input_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.32.input_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.32.mlp.fc1.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.32.mlp.fc1.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.32.mlp.fc2.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.32.mlp.fc2.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.32.post_attention_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.32.post_attention_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.33.attention.dense.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.33.attention.dense.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.33.attention.query_key_value.bias": "model-00007-of-00008.safetensors", + 
"model.vision.transformer.layers.33.attention.query_key_value.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.33.input_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.33.input_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.33.mlp.fc1.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.33.mlp.fc1.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.33.mlp.fc2.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.33.mlp.fc2.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.33.post_attention_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.33.post_attention_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.34.attention.dense.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.34.attention.dense.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.34.attention.query_key_value.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.34.attention.query_key_value.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.34.input_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.34.input_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.34.mlp.fc1.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.34.mlp.fc1.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.34.mlp.fc2.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.34.mlp.fc2.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.34.post_attention_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.34.post_attention_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.35.attention.dense.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.35.attention.dense.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.35.attention.query_key_value.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.35.attention.query_key_value.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.35.input_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.35.input_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.35.mlp.fc1.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.35.mlp.fc1.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.35.mlp.fc2.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.35.mlp.fc2.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.35.post_attention_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.35.post_attention_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.36.attention.dense.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.36.attention.dense.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.36.attention.query_key_value.bias": "model-00007-of-00008.safetensors", + 
"model.vision.transformer.layers.36.attention.query_key_value.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.36.input_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.36.input_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.36.mlp.fc1.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.36.mlp.fc1.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.36.mlp.fc2.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.36.mlp.fc2.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.36.post_attention_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.36.post_attention_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.37.attention.dense.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.37.attention.dense.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.37.attention.query_key_value.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.37.attention.query_key_value.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.37.input_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.37.input_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.37.mlp.fc1.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.37.mlp.fc1.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.37.mlp.fc2.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.37.mlp.fc2.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.37.post_attention_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.37.post_attention_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.38.attention.dense.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.38.attention.dense.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.38.attention.query_key_value.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.38.attention.query_key_value.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.38.input_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.38.input_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.38.mlp.fc1.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.38.mlp.fc1.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.38.mlp.fc2.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.38.mlp.fc2.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.38.post_attention_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.38.post_attention_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.39.attention.dense.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.39.attention.dense.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.39.attention.query_key_value.bias": "model-00007-of-00008.safetensors", + 
"model.vision.transformer.layers.39.attention.query_key_value.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.39.input_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.39.input_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.39.mlp.fc1.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.39.mlp.fc1.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.39.mlp.fc2.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.39.mlp.fc2.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.39.post_attention_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.39.post_attention_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.4.attention.dense.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.4.attention.dense.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.4.attention.query_key_value.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.4.attention.query_key_value.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.4.input_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.4.input_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.4.mlp.fc1.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.4.mlp.fc1.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.4.mlp.fc2.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.4.mlp.fc2.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.4.post_attention_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.4.post_attention_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.40.attention.dense.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.40.attention.dense.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.40.attention.query_key_value.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.40.attention.query_key_value.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.40.input_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.40.input_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.40.mlp.fc1.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.40.mlp.fc1.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.40.mlp.fc2.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.40.mlp.fc2.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.40.post_attention_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.40.post_attention_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.41.attention.dense.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.41.attention.dense.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.41.attention.query_key_value.bias": "model-00007-of-00008.safetensors", + 
"model.vision.transformer.layers.41.attention.query_key_value.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.41.input_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.41.input_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.41.mlp.fc1.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.41.mlp.fc1.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.41.mlp.fc2.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.41.mlp.fc2.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.41.post_attention_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.41.post_attention_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.42.attention.dense.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.42.attention.dense.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.42.attention.query_key_value.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.42.attention.query_key_value.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.42.input_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.42.input_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.42.mlp.fc1.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.42.mlp.fc1.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.42.mlp.fc2.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.42.mlp.fc2.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.42.post_attention_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.42.post_attention_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.43.attention.dense.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.43.attention.dense.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.43.attention.query_key_value.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.43.attention.query_key_value.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.43.input_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.43.input_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.43.mlp.fc1.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.43.mlp.fc1.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.43.mlp.fc2.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.43.mlp.fc2.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.43.post_attention_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.43.post_attention_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.44.attention.dense.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.44.attention.dense.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.44.attention.query_key_value.bias": "model-00007-of-00008.safetensors", + 
"model.vision.transformer.layers.44.attention.query_key_value.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.44.input_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.44.input_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.44.mlp.fc1.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.44.mlp.fc1.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.44.mlp.fc2.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.44.mlp.fc2.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.44.post_attention_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.44.post_attention_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.45.attention.dense.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.45.attention.dense.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.45.attention.query_key_value.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.45.attention.query_key_value.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.45.input_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.45.input_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.45.mlp.fc1.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.45.mlp.fc1.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.45.mlp.fc2.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.45.mlp.fc2.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.45.post_attention_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.45.post_attention_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.46.attention.dense.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.46.attention.dense.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.46.attention.query_key_value.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.46.attention.query_key_value.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.46.input_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.46.input_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.46.mlp.fc1.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.46.mlp.fc1.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.46.mlp.fc2.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.46.mlp.fc2.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.46.post_attention_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.46.post_attention_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.47.attention.dense.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.47.attention.dense.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.47.attention.query_key_value.bias": "model-00007-of-00008.safetensors", + 
"model.vision.transformer.layers.47.attention.query_key_value.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.47.input_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.47.input_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.47.mlp.fc1.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.47.mlp.fc1.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.47.mlp.fc2.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.47.mlp.fc2.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.47.post_attention_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.47.post_attention_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.48.attention.dense.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.48.attention.dense.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.48.attention.query_key_value.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.48.attention.query_key_value.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.48.input_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.48.input_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.48.mlp.fc1.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.48.mlp.fc1.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.48.mlp.fc2.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.48.mlp.fc2.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.48.post_attention_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.48.post_attention_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.49.attention.dense.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.49.attention.dense.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.49.attention.query_key_value.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.49.attention.query_key_value.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.49.input_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.49.input_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.49.mlp.fc1.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.49.mlp.fc1.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.49.mlp.fc2.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.49.mlp.fc2.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.49.post_attention_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.49.post_attention_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.5.attention.dense.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.5.attention.dense.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.5.attention.query_key_value.bias": "model-00006-of-00008.safetensors", + 
"model.vision.transformer.layers.5.attention.query_key_value.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.5.input_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.5.input_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.5.mlp.fc1.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.5.mlp.fc1.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.5.mlp.fc2.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.5.mlp.fc2.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.5.post_attention_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.5.post_attention_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.50.attention.dense.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.50.attention.dense.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.50.attention.query_key_value.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.50.attention.query_key_value.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.50.input_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.50.input_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.50.mlp.fc1.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.50.mlp.fc1.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.50.mlp.fc2.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.50.mlp.fc2.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.50.post_attention_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.50.post_attention_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.51.attention.dense.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.51.attention.dense.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.51.attention.query_key_value.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.51.attention.query_key_value.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.51.input_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.51.input_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.51.mlp.fc1.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.51.mlp.fc1.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.51.mlp.fc2.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.51.mlp.fc2.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.51.post_attention_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.51.post_attention_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.52.attention.dense.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.52.attention.dense.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.52.attention.query_key_value.bias": "model-00007-of-00008.safetensors", + 
"model.vision.transformer.layers.52.attention.query_key_value.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.52.input_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.52.input_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.52.mlp.fc1.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.52.mlp.fc1.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.52.mlp.fc2.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.52.mlp.fc2.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.52.post_attention_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.52.post_attention_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.53.attention.dense.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.53.attention.dense.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.53.attention.query_key_value.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.53.attention.query_key_value.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.53.input_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.53.input_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.53.mlp.fc1.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.53.mlp.fc1.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.53.mlp.fc2.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.53.mlp.fc2.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.53.post_attention_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.53.post_attention_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.54.attention.dense.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.54.attention.dense.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.54.attention.query_key_value.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.54.attention.query_key_value.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.54.input_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.54.input_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.54.mlp.fc1.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.54.mlp.fc1.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.54.mlp.fc2.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.54.mlp.fc2.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.54.post_attention_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.54.post_attention_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.55.attention.dense.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.55.attention.dense.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.55.attention.query_key_value.bias": "model-00007-of-00008.safetensors", + 
"model.vision.transformer.layers.55.attention.query_key_value.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.55.input_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.55.input_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.55.mlp.fc1.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.55.mlp.fc1.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.55.mlp.fc2.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.55.mlp.fc2.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.55.post_attention_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.55.post_attention_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.56.attention.dense.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.56.attention.dense.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.56.attention.query_key_value.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.56.attention.query_key_value.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.56.input_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.56.input_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.56.mlp.fc1.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.56.mlp.fc1.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.56.mlp.fc2.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.56.mlp.fc2.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.56.post_attention_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.56.post_attention_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.57.attention.dense.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.57.attention.dense.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.57.attention.query_key_value.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.57.attention.query_key_value.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.57.input_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.57.input_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.57.mlp.fc1.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.57.mlp.fc1.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.57.mlp.fc2.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.57.mlp.fc2.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.57.post_attention_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.57.post_attention_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.58.attention.dense.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.58.attention.dense.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.58.attention.query_key_value.bias": "model-00007-of-00008.safetensors", + 
"model.vision.transformer.layers.58.attention.query_key_value.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.58.input_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.58.input_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.58.mlp.fc1.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.58.mlp.fc1.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.58.mlp.fc2.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.58.mlp.fc2.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.58.post_attention_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.58.post_attention_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.59.attention.dense.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.59.attention.dense.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.59.attention.query_key_value.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.59.attention.query_key_value.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.59.input_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.59.input_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.59.mlp.fc1.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.59.mlp.fc1.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.59.mlp.fc2.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.59.mlp.fc2.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.59.post_attention_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.59.post_attention_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.6.attention.dense.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.6.attention.dense.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.6.attention.query_key_value.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.6.attention.query_key_value.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.6.input_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.6.input_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.6.mlp.fc1.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.6.mlp.fc1.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.6.mlp.fc2.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.6.mlp.fc2.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.6.post_attention_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.6.post_attention_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.60.attention.dense.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.60.attention.dense.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.60.attention.query_key_value.bias": "model-00007-of-00008.safetensors", + 
"model.vision.transformer.layers.60.attention.query_key_value.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.60.input_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.60.input_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.60.mlp.fc1.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.60.mlp.fc1.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.60.mlp.fc2.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.60.mlp.fc2.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.60.post_attention_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.60.post_attention_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.61.attention.dense.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.61.attention.dense.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.61.attention.query_key_value.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.61.attention.query_key_value.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.61.input_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.61.input_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.61.mlp.fc1.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.61.mlp.fc1.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.61.mlp.fc2.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.61.mlp.fc2.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.61.post_attention_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.61.post_attention_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.62.attention.dense.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.62.attention.dense.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.62.attention.query_key_value.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.62.attention.query_key_value.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.62.input_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.62.input_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.62.mlp.fc1.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.62.mlp.fc1.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.62.mlp.fc2.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.62.mlp.fc2.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.62.post_attention_layernorm.bias": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.62.post_attention_layernorm.weight": "model-00007-of-00008.safetensors", + "model.vision.transformer.layers.7.attention.dense.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.7.attention.dense.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.7.attention.query_key_value.bias": "model-00006-of-00008.safetensors", + 
"model.vision.transformer.layers.7.attention.query_key_value.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.7.input_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.7.input_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.7.mlp.fc1.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.7.mlp.fc1.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.7.mlp.fc2.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.7.mlp.fc2.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.7.post_attention_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.7.post_attention_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.8.attention.dense.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.8.attention.dense.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.8.attention.query_key_value.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.8.attention.query_key_value.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.8.input_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.8.input_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.8.mlp.fc1.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.8.mlp.fc1.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.8.mlp.fc2.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.8.mlp.fc2.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.8.post_attention_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.8.post_attention_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.9.attention.dense.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.9.attention.dense.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.9.attention.query_key_value.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.9.attention.query_key_value.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.9.input_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.9.input_layernorm.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.9.mlp.fc1.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.9.mlp.fc1.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.9.mlp.fc2.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.9.mlp.fc2.weight": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.9.post_attention_layernorm.bias": "model-00006-of-00008.safetensors", + "model.vision.transformer.layers.9.post_attention_layernorm.weight": "model-00006-of-00008.safetensors" + } +} diff --git a/modeling_cogvlm.py b/modeling_cogvlm.py new file mode 100644 index 0000000000000000000000000000000000000000..30ef13d6eec5fe1833a9aec550128d222a57a213 --- /dev/null +++ b/modeling_cogvlm.py @@ -0,0 +1,785 @@ +"""largely copy from llama and adapt for cogvlm""" +import warnings +from typing import TYPE_CHECKING, Optional, Tuple, 
List, Union, Literal, Dict, Any + +import math +import torch +from torch import nn +from torch.nn import CrossEntropyLoss +from torchvision import transforms +from einops import rearrange + +from transformers import PreTrainedModel, PreTrainedTokenizer +from transformers.utils.logging import get_logger +from transformers.activations import ACT2FN +from transformers.modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast + +from .configuration_cogvlm import CogVLMConfig +from .util import FastRotaryEmbedding +from .visual import EVA2CLIPModel + +if TYPE_CHECKING: + from transformers.utils import ModelOutput + +logger = get_logger(__name__) + +LANGUAGE_TOKEN_TYPE = 0 +VISION_TOKEN_TYPE = 1 + + +# Copied from transformers.models.bart.modeling_bart._make_causal_mask +def _make_causal_mask( + input_ids_shape: torch.Size, dtype: torch.dtype, device: torch.device, past_key_values_length: int = 0 +): + """ + Make causal mask used for bi-directional self-attention. + """ + bsz, tgt_len = input_ids_shape + mask = torch.full((tgt_len, tgt_len), torch.finfo(dtype).min, device=device) + mask_cond = torch.arange(mask.size(-1), device=device) + mask.masked_fill_(mask_cond < (mask_cond + 1).view(mask.size(-1), 1), 0) + mask = mask.to(dtype) + + if past_key_values_length > 0: + mask = torch.cat([torch.zeros(tgt_len, past_key_values_length, dtype=dtype, device=device), mask], dim=-1) + return mask[None, None, :, :].expand(bsz, 1, tgt_len, tgt_len + past_key_values_length) + + +# Copied from transformers.models.bart.modeling_bart._expand_mask +def _expand_mask(mask: torch.Tensor, dtype: torch.dtype, tgt_len: Optional[int] = None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill(inverted_mask.to(torch.bool), torch.finfo(dtype).min) + + +class RMSNorm(nn.Module): + def __init__(self, hidden_size, eps=1e-6): + super().__init__() + self.weight = nn.Parameter(torch.ones(hidden_size)) + self.variance_epsilon = eps + + def forward(self, hidden_states): + input_dtype = hidden_states.dtype + hidden_states = hidden_states.to(torch.float32) + variance = hidden_states.pow(2).mean(-1, keepdim=True) + hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon) + return (self.weight * hidden_states).to(input_dtype) + + +class MLP(nn.Module): + def __init__(self, config): + super().__init__() + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, x): + down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x)) + return down_proj + + +def get_expert_mask(token_type_ids: "torch.LongTensor(B, L)") -> "[torch.BoolTensor(B, L), torch.BoolTensor(B, L)]": + vision_token_mask = torch.zeros_like(token_type_ids, dtype=torch.bool) + vision_token_mask[:, :-1] = (token_type_ids[:, :-1] == VISION_TOKEN_TYPE) & (token_type_ids[:, 1:] == VISION_TOKEN_TYPE) + language_token_mask = ~vision_token_mask + return vision_token_mask, language_token_mask + + 
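+# Illustrative doctest-style sketch (kept in comments so nothing runs at import time): get_expert_mask
+# marks a position as "vision" only when that position *and its successor* carry VISION_TOKEN_TYPE,
+# so the last token of an image span (the EOI slot) is routed through the language expert.
+#
+#     >>> token_type_ids = torch.tensor([[0, 1, 1, 1, 0, 0]])  # BOS, three vision tokens, two text tokens
+#     >>> vision_token_mask, language_token_mask = get_expert_mask(token_type_ids)
+#     >>> vision_token_mask
+#     tensor([[False,  True,  True, False, False, False]])
+#     >>> language_token_mask
+#     tensor([[ True, False, False,  True,  True,  True]])
+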
+class VisionExpertMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.language_mlp = MLP(config) + self.vision_mlp = MLP(config) + + def forward(self, hidden_states: "torch.Tensor(B, L, D)", token_type_ids: "torch.LongTensor(B, L)"): + output = torch.empty(hidden_states.shape, dtype=hidden_states.dtype, device=hidden_states.device) + vision_token_mask, language_token_mask = get_expert_mask(token_type_ids) + output[vision_token_mask] = self.vision_mlp(hidden_states[vision_token_mask]) + output[language_token_mask] = self.language_mlp(hidden_states[language_token_mask]) + return output + + +def attention_fn( + query_layer: "torch.tensor(B, H, L, HD)", + key_layer: "torch.tensor(B, H, L, HD)", + value_layer: "torch.tensor(B, H, L, HD)", + attention_mask: "torch.tensor(B, H, L, HD)", + *, + scaling_attention_score: bool = True, + attention_dropout: nn.Module = None +): + attention_mask_bool = (attention_mask == 0) + is_low_triangle = (attention_mask_bool == torch.ones_like(attention_mask_bool, dtype=torch.float).tril()).all() + is_full = (attention_mask_bool > 0).all() + if not (int(torch.__version__.split('.')[0]) >= 2): + warnings.warn("It's recommended to use torch2.0 or higher.") + if int(torch.__version__.split('.')[0]) >= 2 and scaling_attention_score and (is_full or is_low_triangle): + dropout_p = 0. if attention_dropout is None or not attention_dropout.training else attention_dropout.p + return torch.nn.functional.scaled_dot_product_attention( + query_layer, key_layer, value_layer, + attn_mask=None, + dropout_p=dropout_p, + is_causal=not is_full + ) + else: + if scaling_attention_score: + query_layer = query_layer / math.sqrt(query_layer.shape[-1]) + attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2)) + attention_scores = attention_scores + attention_mask + attention_scores = nn.functional.softmax(attention_scores, dim=-1, dtype=torch.float32).to(query_layer.dtype) + if attention_dropout is not None: + attention_scores = attention_dropout(attention_scores) + context_layer = torch.matmul(attention_scores, value_layer) + return context_layer + + +class VisionExpertAttention(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.hidden_size // self.num_heads + self.max_position_embeddings = config.max_position_embeddings + + # self.rotary_emb = RotaryEmbedding(self.hidden_size // self.num_heads) + self.rotary_emb = FastRotaryEmbedding(dim=self.head_dim, pos_idx_in_fp32=False) + self.vision_expert_query_key_value = nn.Linear(self.hidden_size, self.hidden_size * 3, bias=False) + self.vision_expert_dense = nn.Linear(self.hidden_size, self.hidden_size, bias=False) + self.language_expert_query_key_value = nn.Linear(self.hidden_size, self.hidden_size * 3, bias=False) + self.language_expert_dense = nn.Linear(self.hidden_size, self.hidden_size, bias=False) + + def _transpose_for_scores(self, tensor): + """Transpose a 3D tensor [B, L, H*HD] into a 4D tensor with size [B H L HD].""" + new_tensor_shape = tensor.size()[:-1] + (self.num_heads, self.head_dim) + tensor = tensor.view(*new_tensor_shape) + return tensor.permute(0, 2, 1, 3) + + def forward( + self, + hidden_states: torch.Tensor, + token_type_ids: torch.LongTensor, + position_ids: torch.LongTensor, + attention_mask: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = 
False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + vision_token_mask, language_token_mask = get_expert_mask(token_type_ids) + + shape = list(hidden_states.shape) + shape[-1] = shape[-1] * 3 + mixed_raw_layer = torch.empty(shape, dtype=hidden_states.dtype, device=hidden_states.device) + mixed_raw_layer[vision_token_mask] = self.vision_expert_query_key_value(hidden_states[vision_token_mask]) + mixed_raw_layer[language_token_mask] = self.language_expert_query_key_value(hidden_states[language_token_mask]) + + query_states, key_states, value_states = torch.split(mixed_raw_layer, self.hidden_size, dim=-1) + query_states = self._transpose_for_scores(query_states) # B, H, L, HD + key_states = self._transpose_for_scores(key_states) # B, H, L, HD + value_states = self._transpose_for_scores(value_states) # B, H, L, HD + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + + query_states, key_states = self.rotary_emb(query_states, key_states, position_ids=position_ids, max_seqlen=position_ids.max() + 1) + + if past_key_value is not None: + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + + past_key_value = (key_states, value_states) if use_cache else None + + context_layer = attention_fn( + query_layer=query_states, key_layer=key_states, value_layer=value_states, attention_mask=attention_mask, + scaling_attention_score=True, attention_dropout=None) + if context_layer.size() != (bsz, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is" + f" {context_layer.size()}" + ) + context_layer = context_layer.transpose(1, 2).contiguous().reshape(bsz, q_len, self.hidden_size) + + attn_output = torch.empty(context_layer.shape, dtype=hidden_states.dtype, device=hidden_states.device) + attn_output[vision_token_mask] = self.vision_expert_dense(context_layer[vision_token_mask]) + attn_output[language_token_mask] = self.language_expert_dense(context_layer[language_token_mask]) + + if output_attentions: + warnings.warn("output_attentions is not implemented.") + + return attn_output, None, past_key_value + + +class CogVLMDecoderLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.hidden_size = config.hidden_size + self.self_attn = VisionExpertAttention(config=config) + self.mlp = VisionExpertMLP(config) + self.input_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.post_attention_layernorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + token_type_ids: torch.LongTensor, + position_ids: torch.LongTensor, + attention_mask: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: Optional[bool] = False, + use_cache: Optional[bool] = False, + ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: + residual = hidden_states + + hidden_states = self.input_layernorm(hidden_states) + + # Self Attention + hidden_states, self_attn_weights, present_key_value = self.self_attn( + hidden_states=hidden_states, + token_type_ids=token_type_ids, + position_ids=position_ids, + attention_mask=attention_mask, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = residual + 
hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + hidden_states = self.mlp(hidden_states, token_type_ids=token_type_ids) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (self_attn_weights,) + + if use_cache: + outputs += (present_key_value,) + + return outputs # type: ignore + + +class CogVLMPreTrainedModel(PreTrainedModel): + config_class = CogVLMConfig + base_model_prefix = "model" + supports_gradient_checkpointing = False + _no_split_modules = ["CogVLMDecoderLayer"] + _skip_keys_device_placement = "past_key_values" + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, nn.Linear): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +def is_empty(images_list: Optional[List[List[torch.Tensor]]]): + if images_list is None or len(images_list) == 0: + return True + for image_list in images_list: + if len(image_list): + return False + return True + + +def build_position_ids(x: "torch.BoolTensor(B, L)", attention_mask: Optional["torch.BoolTensor(B, L)"] = None) -> "torch.LongTensor(B, L)": + if attention_mask is not None: + tmp = x.clone() + tmp[~(attention_mask.bool())] = -1 + else: + tmp = x.clone() + # image boi eoi token as LANGUAGE_TOKEN_TYPE + is_boi_eoi = torch.zeros_like(x, dtype=torch.bool) + is_boi_eoi[:, 1:] |= (tmp[:, 1:] == VISION_TOKEN_TYPE) & (tmp[:, :-1] == LANGUAGE_TOKEN_TYPE) + is_boi_eoi[:, 0] |= (tmp[:, 0] == VISION_TOKEN_TYPE) + is_boi_eoi[:, :-1] |= (tmp[:, :-1] == VISION_TOKEN_TYPE) & (tmp[:, 1:] == LANGUAGE_TOKEN_TYPE) + is_boi_eoi[:, -1] |= (tmp[:, -1] == VISION_TOKEN_TYPE) + tmp[is_boi_eoi] = LANGUAGE_TOKEN_TYPE + # final position ids + y = torch.zeros_like(x, dtype=torch.long) + y[:, 1:] = (tmp[:, 1:] == LANGUAGE_TOKEN_TYPE) | ((tmp[:, 1:] == VISION_TOKEN_TYPE) & (tmp[:, :-1] == LANGUAGE_TOKEN_TYPE)) + y = y.cumsum(dim=-1) + return y + + +class CogVLMModel(CogVLMPreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.padding_idx = config.pad_token_id + self.vocab_size = config.vocab_size + + self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx) + self.layers = nn.ModuleList([CogVLMDecoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + + self.vision = EVA2CLIPModel(config) + + self.gradient_checkpointing = False + # Initialize weights and apply final processing + self.post_init() + + def encode_images(self, images: List[List[torch.Tensor]]) -> torch.Tensor: + images_list, images = images, [] + + images = [] + for image_list in images_list: + for image in image_list: + images.append(image) + + images = torch.stack(images) + images_features = self.vision(images) + return images_features + + def forward( + self, + input_ids: torch.LongTensor = None, + images: List[List[torch.Tensor]] = None, + token_type_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: 
Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + """take care of image_encode, token_type_ids, position_ids and (attention_mask = None is fine)""" + + if past_key_values is not None: + pass # generate mode with past_key_values. the image features are already mapped + else: + # not allow for inputs_embeds, because we want to process image feature + assert input_ids is not None and inputs_embeds is None, f"{input_ids} {inputs_embeds}" + if not is_empty(images): # multi-modality + assert token_type_ids is not None, f"multi-modality requires `token_type_ids`!" + assert len(input_ids) == len(images), f"{len(input_ids)} {len(images)}" + inputs_embeds = self.embed_tokens(input_ids) + images_features = self.encode_images(images) + images_features = rearrange(images_features, 'b n d -> (b n) d') + images_features = images_features.to(dtype=inputs_embeds.dtype, device=inputs_embeds.device) + inputs_embeds = inputs_embeds.index_put([token_type_ids == VISION_TOKEN_TYPE], images_features) + else: # single-modality + if token_type_ids is None: + token_type_ids = torch.ones_like(input_ids, dtype=torch.long, device=input_ids.device) * LANGUAGE_TOKEN_TYPE + assert not (token_type_ids == VISION_TOKEN_TYPE).any(), f"{(token_type_ids == VISION_TOKEN_TYPE).sum()}" + inputs_embeds = self.embed_tokens(input_ids) + + if position_ids is None: + position_ids = build_position_ids(token_type_ids, attention_mask) + input_ids = None + + return self.llm_forward( + input_ids=input_ids, + token_type_ids=token_type_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + def llm_forward( + self, + input_ids: torch.LongTensor = None, + token_type_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPast]: + """largely copy from llama forward and adapt for cogvlm with `token_type_ids`""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + use_cache = use_cache if use_cache is not None else self.config.use_cache + + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # retrieve input_ids and inputs_embeds + if input_ids is not None and inputs_embeds is not None: + raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time") + elif input_ids is not None: + batch_size, seq_length = input_ids.shape + elif inputs_embeds is not None: + batch_size, seq_length, _ = inputs_embeds.shape + else: + raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds") + + seq_length_with_past = seq_length + past_key_values_length = 0 + + if past_key_values is not None: + past_key_values_length = past_key_values[0][0].shape[2] + seq_length_with_past 
= seq_length_with_past + past_key_values_length + + if position_ids is None: + device = input_ids.device if input_ids is not None else inputs_embeds.device + position_ids = torch.arange( + past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device + ) + position_ids = position_ids.unsqueeze(0).view(-1, seq_length) + else: + position_ids = position_ids.view(-1, seq_length).long() + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + # embed positions + if attention_mask is None: + attention_mask = torch.ones( + (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device + ) + attention_mask = self._prepare_decoder_attention_mask( + attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length + ) + + hidden_states = inputs_embeds + + # decoder layers + all_hidden_states = () if output_hidden_states else None + all_self_attns = () if output_attentions else None + next_decoder_cache = () if use_cache else None + + for idx, decoder_layer in enumerate(self.layers): + if output_hidden_states: + all_hidden_states += (hidden_states,) + + past_key_value = past_key_values[idx] if past_key_values is not None else None + layer_outputs = decoder_layer( + hidden_states, + token_type_ids=token_type_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_value=past_key_value, + output_attentions=output_attentions, + use_cache=use_cache, + ) + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[2 if output_attentions else 1],) + + if output_attentions: + all_self_attns += (layer_outputs[1],) + + hidden_states = self.norm(hidden_states) + + # add hidden states from the last decoder layer + if output_hidden_states: + all_hidden_states += (hidden_states,) + + next_cache = next_decoder_cache if use_cache else None + if not return_dict: + return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None) + return BaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=next_cache, + hidden_states=all_hidden_states, + attentions=all_self_attns, + ) + + def get_input_embeddings(self): + return self.embed_tokens + + def set_input_embeddings(self, value): + self.embed_tokens = value + + # noinspection PyMethodMayBeStatic + # Copied from transformers.models.bart.modeling_bart.BartDecoder._prepare_decoder_attention_mask + def _prepare_decoder_attention_mask(self, attention_mask, input_shape, inputs_embeds, past_key_values_length): + # create causal mask + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + combined_attention_mask = None + if input_shape[-1] > 1: + combined_attention_mask = _make_causal_mask( + input_shape, + inputs_embeds.dtype, + device=inputs_embeds.device, + past_key_values_length=past_key_values_length, + ) + + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + expanded_attn_mask = _expand_mask(attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]).to( + inputs_embeds.device + ) + combined_attention_mask = ( + expanded_attn_mask if combined_attention_mask is None else expanded_attn_mask + combined_attention_mask + ) + + return combined_attention_mask + + +def chat_history_to_prompt(history, query): + prompt = " [INST] " + for i, (old_query, response) in enumerate(history): + prompt += old_query + " [/INST] " + response + " [INST] " + prompt += query + " [/INST] " + return prompt + + +def base_history_to_prompt(history, query): + 
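+    # The "base" template forwards the raw query unchanged, while chat_history_to_prompt above wraps
+    # every turn in " [INST] ... [/INST] " markers; the choice is made via config.template_version
+    # (or the template_version argument of build_conversation_input_ids).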
prompt = query + return prompt + + +_history_to_prompt = { + "base": base_history_to_prompt, + "chat": chat_history_to_prompt +} + + +class CogVLMForCausalLM(CogVLMPreTrainedModel): + _auto_class = "AutoModelForCausalLM" + + def __init__(self, config): + super().__init__(config) + self.model = CogVLMModel(config) + self.vocab_size = config.vocab_size + self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self): + return self.model.embed_tokens + + def set_input_embeddings(self, value): + self.model.embed_tokens = value + + def get_output_embeddings(self): + return self.lm_head + + def set_output_embeddings(self, new_embeddings): + self.lm_head = new_embeddings + + def set_decoder(self, decoder): + self.model = decoder + + def get_decoder(self): + return self.model + + def forward( + self, + input_ids: torch.LongTensor = None, + images: List[List[torch.Tensor]] = None, + token_type_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[torch.LongTensor] = None, + ) -> Union[Tuple, CausalLMOutputWithPast]: + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + images=images, + token_type_ids=token_type_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + hidden_states = outputs[0] + logits = self.lm_head(hidden_states) + logits = logits.float() + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + # Enable model parallelism + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def _prepare_attention_mask_for_generation( + self, + inputs: torch.Tensor, + pad_token_id: Optional[int], + eos_token_id: Optional[Union[int, List[int]]], + ) -> torch.LongTensor: + return torch.ones(inputs.shape[:2], dtype=torch.long, device=inputs.device) # type: ignore + + def prepare_inputs_for_generation( + self, input_ids, token_type_ids, images=None, past_key_values=None, 
attention_mask=None, inputs_embeds=None, **kwargs + ): + # build position_ids if needed + position_ids = kwargs.get("position_ids", None) + if position_ids is None: + position_ids = build_position_ids(token_type_ids, attention_mask) + + if past_key_values: + input_ids = input_ids[:, -1:] + token_type_ids = token_type_ids[:, -1:] + position_ids = position_ids[:, -1:] + + # if `inputs_embeds` are passed, we only want to use them in the 1st generation step + if inputs_embeds is not None and past_key_values is None: + model_inputs = {"inputs_embeds": inputs_embeds} + else: + model_inputs = {"input_ids": input_ids} + + model_inputs.update( + { + "token_type_ids": token_type_ids, + "images": images, + "position_ids": position_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + ) + return model_inputs + + def _update_model_kwargs_for_generation( + self, + outputs: "ModelOutput", + model_kwargs: Dict[str, Any], + is_encoder_decoder: bool = False, + standardize_cache_format: bool = False, + ) -> Dict[str, Any]: + # update past_key_values + model_kwargs["past_key_values"] = self._extract_past_from_model_output( + outputs, standardize_cache_format=standardize_cache_format + ) + if getattr(outputs, "state", None) is not None: + model_kwargs["state"] = outputs.state + + # update token_type_ids with last value + if "token_type_ids" in model_kwargs: + token_type_ids = model_kwargs["token_type_ids"] + new_token_type_ids = torch.ones(size=(token_type_ids.shape[0], 1), dtype=token_type_ids.dtype, device=token_type_ids.device) * LANGUAGE_TOKEN_TYPE + model_kwargs["token_type_ids"] = torch.cat([token_type_ids, new_token_type_ids], dim=-1) + + if not is_encoder_decoder: + # update attention mask + if "attention_mask" in model_kwargs: + attention_mask = model_kwargs["attention_mask"] + model_kwargs["attention_mask"] = torch.cat( + [attention_mask, attention_mask.new_ones((attention_mask.shape[0], 1))], dim=-1 + ) + else: + # update decoder attention mask + if "decoder_attention_mask" in model_kwargs: + decoder_attention_mask = model_kwargs["decoder_attention_mask"] + model_kwargs["decoder_attention_mask"] = torch.cat( + [decoder_attention_mask, decoder_attention_mask.new_ones((decoder_attention_mask.shape[0], 1))], + dim=-1, + ) + + return model_kwargs + + def _reorder_cache(self, past_key_values, beam_idx): + reordered_past = () + for layer_past in past_key_values: + reordered_past += ( + tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past), + ) + return reordered_past + + def build_conversation_input_ids( + self, + tokenizer: "PreTrainedTokenizer", + *, + query: str, + history: Optional[List[Tuple[str, str]]] = None, + images: Optional[List["PIL.Image"]] = None, + template_version: Optional[Literal["base", "chat"]] = None, + ): + image_size: int = self.config.vision_config['image_size'] + patch_size: int = self.config.vision_config['patch_size'] + template_version = template_version or self.config.template_version + assert images is None or len(images) <= 1, f"not support multi images by now." 
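+        # At most one image is supported: it is resized to image_size, normalized, and represented in
+        # input_ids by a run of pad-token placeholders (one per image patch plus the BOI/EOI slots,
+        # hence the "+ 2" in vision_token_num below); CogVLMModel.forward later overwrites those
+        # placeholder embeddings with the EVA2CLIP features via index_put.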
+ history = history or [] + text = _history_to_prompt[template_version](history, query) + + input_ids = [tokenizer.bos_token_id] + token_type_ids = [LANGUAGE_TOKEN_TYPE] + if images is not None and len(images) == 1: + # vision + transform = transforms.Compose( + [ + transforms.Resize( + (image_size, image_size), interpolation=transforms.InterpolationMode.BICUBIC + ), + transforms.ToTensor(), + transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), + ] + ) + images = [transform(images[0])] + # language + vision_token_num = (image_size // patch_size) * (image_size // patch_size) + 2 + input_ids += [tokenizer.pad_token_id] * vision_token_num + token_type_ids += [VISION_TOKEN_TYPE] * vision_token_num + text_ids = tokenizer.encode(text, add_special_tokens=False) + + input_ids += text_ids + token_type_ids += [LANGUAGE_TOKEN_TYPE] * len(text_ids) + attention_mask = [1] * len(input_ids) + + return { + 'input_ids': torch.tensor(input_ids, dtype=torch.long), + 'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long), + 'attention_mask': torch.tensor(attention_mask, dtype=torch.long), + 'images': images, + } diff --git a/util.py b/util.py new file mode 100644 index 0000000000000000000000000000000000000000..1dccacad2ded4c357ab9cb23d04027500256e281 --- /dev/null +++ b/util.py @@ -0,0 +1,483 @@ +from typing import Optional, Tuple, Union + +import torch +from einops import rearrange, repeat +import torch.nn.functional as F + +import triton +import triton.language as tl + + +# @triton.autotune( +# configs=[ +# triton.Config({"BLOCK_M": 2}), +# triton.Config({"BLOCK_M": 4}), +# triton.Config({"BLOCK_M": 8}), +# triton.Config({"BLOCK_M": 16}), +# ], +# key=["CACHE_KEY_SEQLEN", "BLOCK_K", "INTERLEAVED"], +# ) +@triton.jit +def rotary_kernel( + OUT, # Pointers to matrices + X, + COS, + SIN, + CU_SEQLENS, + SEQLEN_OFFSETS, # this could be int or a pointer + # Matrix dimensions + seqlen, + nheads, + rotary_dim, + seqlen_ro, + CACHE_KEY_SEQLEN, + # strides + stride_out_batch, + stride_out_nheads, + stride_out_seqlen, + stride_out_headdim, + stride_x_batch, + stride_x_nheads, + stride_x_seqlen, + stride_x_headdim, + # Meta-parameters + BLOCK_K: tl.constexpr, + IS_SEQLEN_OFFSETS_TENSOR: tl.constexpr, + IS_VARLEN: tl.constexpr, + INTERLEAVED: tl.constexpr, + CONJUGATE: tl.constexpr, + BLOCK_M: tl.constexpr, +): + pid_m = tl.program_id(axis=0) + pid_batch = tl.program_id(axis=1) + pid_head = tl.program_id(axis=2) + rotary_dim_half = rotary_dim // 2 + + if not IS_VARLEN: + X = X + pid_batch * stride_x_batch + pid_head * stride_x_nheads + OUT = OUT + pid_batch * stride_out_batch + pid_head * stride_out_nheads + COS = COS + pid_batch * seqlen_ro * rotary_dim_half + SIN = SIN + pid_batch * seqlen_ro * rotary_dim_half + else: + start_idx = tl.load(CU_SEQLENS + pid_batch) + seqlen = tl.load(CU_SEQLENS + pid_batch + 1) - start_idx + X = X + start_idx * stride_x_seqlen + pid_head * stride_x_nheads + OUT = OUT + start_idx * stride_out_seqlen + pid_head * stride_out_nheads + + if pid_m * BLOCK_M >= seqlen: + return + rm = pid_m * BLOCK_M + tl.arange(0, BLOCK_M) + if not IS_SEQLEN_OFFSETS_TENSOR: + rm_cs = rm + SEQLEN_OFFSETS + else: + rm_cs = rm + tl.load(SEQLEN_OFFSETS + pid_batch) + rk = tl.arange(0, BLOCK_K) + rk_half = tl.arange(0, BLOCK_K // 2) + + if not INTERLEAVED: + # Load the 1st and 2nd halves of X, do calculation, then store to 1st and 2nd halves of OUT + X = X + (rm[:, None] * stride_x_seqlen + rk_half[None, :] * stride_x_headdim) + COS = COS + (rm_cs[:, None] * 
rotary_dim_half + rk_half[None, :]) + SIN = SIN + (rm_cs[:, None] * rotary_dim_half + rk_half[None, :]) + cos = tl.load( + COS, mask=(rm_cs[:, None] < seqlen_ro) & (rk_half[None, :] < rotary_dim_half), other=1.0 + ) + sin = tl.load( + SIN, mask=(rm_cs[:, None] < seqlen_ro) & (rk_half[None, :] < rotary_dim_half), other=0.0 + ) + x0 = tl.load( + X, mask=(rm[:, None] < seqlen) & (rk_half[None, :] < rotary_dim_half), other=0.0 + ) + x1 = tl.load( + X + rotary_dim_half * stride_x_headdim, + mask=(rm[:, None] < seqlen) & (rk_half[None, :] < rotary_dim_half), + other=0.0, + ) + if CONJUGATE: + sin = -sin + o0 = x0 * cos - x1 * sin + o1 = x0 * sin + x1 * cos + # write back result + OUT = OUT + (rm[:, None] * stride_out_seqlen + rk_half[None, :] * stride_out_headdim) + tl.store(OUT, o0, mask=(rm[:, None] < seqlen) & (rk_half[None, :] < rotary_dim_half)) + tl.store( + OUT + rotary_dim_half * stride_out_headdim, + o1, + mask=(rm[:, None] < seqlen) & (rk_half[None, :] < rotary_dim_half), + ) + else: + # We don't want to load X[0, 2, 4, ...] and X[1, 3, 5, ...] separately since both are slow. + # Instead, we load x0 = X[0, 1, 2, 3, ...] and x1 = X[1, 0, 3, 2, ...]. + # Loading x0 will be fast but x1 will be slow. + # Then we load cos = COS[0, 0, 1, 1, ...] and sin = SIN[0, 0, 1, 1, ...]. + # Then we do the calculation and use tl.where to pick put the right outputs for the even + # and for the odd indices. + rk_swap = rk + ((rk + 1) % 2) * 2 - 1 # 1, 0, 3, 2, 5, 4, ... + rk_repeat = tl.arange(0, BLOCK_K) // 2 + X0 = X + (rm[:, None] * stride_x_seqlen + rk[None, :] * stride_x_headdim) + X1 = X + (rm[:, None] * stride_x_seqlen + rk_swap[None, :] * stride_x_headdim) + COS = COS + (rm_cs[:, None] * rotary_dim_half + rk_repeat[None, :]) + SIN = SIN + (rm_cs[:, None] * rotary_dim_half + rk_repeat[None, :]) + cos = tl.load( + COS, + mask=(rm_cs[:, None] < seqlen_ro) & (rk_repeat[None, :] < rotary_dim_half), + other=1.0, + ).to(tl.float32) + sin = tl.load( + SIN, + mask=(rm_cs[:, None] < seqlen_ro) & (rk_repeat[None, :] < rotary_dim_half), + other=0.0, + ).to(tl.float32) + x0 = tl.load(X0, mask=(rm[:, None] < seqlen) & (rk[None, :] < rotary_dim), other=0.0).to( + tl.float32 + ) + x1 = tl.load( + X1, mask=(rm[:, None] < seqlen) & (rk_swap[None, :] < rotary_dim), other=0.0 + ).to(tl.float32) + if CONJUGATE: + sin = -sin + x0_cos = x0 * cos + x1_sin = x1 * sin + out = tl.where(rk[None, :] % 2 == 0, x0_cos - x1_sin, x0_cos + x1_sin) + OUT = OUT + (rm[:, None] * stride_out_seqlen + rk[None, :] * stride_out_headdim) + tl.store(OUT, out, mask=(rm[:, None] < seqlen) & (rk[None, :] < rotary_dim)) + + +def apply_rotary( + x: torch.Tensor, + cos: torch.Tensor, + sin: torch.Tensor, + seqlen_offsets: Union[int, torch.Tensor] = 0, + cu_seqlens: Optional[torch.Tensor] = None, + max_seqlen: Optional[int] = None, + interleaved=False, + inplace=False, + conjugate=False, +) -> torch.Tensor: + """ + Arguments: + x: (batch, seqlen, nheads, headdim) if cu_seqlens is None + else (total_seqlen, nheads, headdim). 
+ cos: (seqlen_ro, rotary_dim / 2) + sin: (seqlen_ro, rotary_dim / 2) + seqlen_offsets: integer or integer tensor of size (batch,) + cu_seqlens: (batch + 1,) or None + max_seqlen: int + Returns: + y: (batch, seqlen, nheads, headdim) + """ + + batch, nheads, seqlen, headdim = x.shape + + batch_ro, seqlen_ro, rotary_dim = cos.shape + + assert batch == batch_ro + assert sin.shape == cos.shape + rotary_dim *= 2 + assert rotary_dim <= headdim, "rotary_dim must be <= headdim" + assert headdim <= 256, "Only support headdim <= 256" + + assert seqlen_ro >= seqlen, "seqlen_ro must be >= seqlen" + + assert ( + cos.dtype == sin.dtype + ), f"cos and sin must have the same dtype, got {cos.dtype} and {sin.dtype}" + assert ( + x.dtype == cos.dtype + ), f"Input and cos/sin must have the same dtype, got {x.dtype} and {cos.dtype}" + + cos, sin = cos.contiguous(), sin.contiguous() + if isinstance(seqlen_offsets, torch.Tensor): + assert seqlen_offsets.shape == (batch,) + assert seqlen_offsets.dtype in [torch.int32, torch.int64] + seqlen_offsets = seqlen_offsets.contiguous() + else: + assert seqlen_offsets + seqlen <= seqlen_ro + + output = torch.empty_like(x) if not inplace else x + if rotary_dim < headdim and not inplace: + output[..., rotary_dim:].copy_(x[..., rotary_dim:]) + + BLOCK_K = ( + 32 + if rotary_dim <= 32 + else (64 if rotary_dim <= 64 else (128 if rotary_dim <= 128 else 256)) + ) + grid = lambda META: (triton.cdiv(seqlen, META["BLOCK_M"]), batch, nheads) # noqa + BLOCK_M = 4 if interleaved else (8 if rotary_dim <= 64 else 4) + + # Need this, otherwise Triton tries to launch from cuda:0 and we get + # ValueError: Pointer argument (at 0) cannot be accessed from Triton (cpu tensor?) + with torch.cuda.device(x.device.index): + rotary_kernel[grid]( + output, # data ptrs + x, + cos, + sin, + cu_seqlens, + seqlen_offsets, + seqlen, # shapes + nheads, + rotary_dim, + seqlen_ro, + seqlen // 128, # key for triton cache (limit number of compilations) + output.stride(0), # batch_strides + output.stride(-3), # nheads_stride + output.stride(-2), # seqlen_stride + output.stride(-1), # headdim_stride + x.stride(0), # batch_strides + x.stride(-3), # nheads stride + x.stride(-2), # seqlen stride + x.stride(-1), # headdim stride + BLOCK_K, + isinstance(seqlen_offsets, torch.Tensor), + False, + interleaved, + conjugate, + BLOCK_M, + ) + return output + + +class ApplyRotaryEmb(torch.autograd.Function): + @staticmethod + def forward( + ctx, + x, + cos, + sin, + interleaved=False, + inplace=False, + seqlen_offsets: Union[int, torch.Tensor] = 0, + cu_seqlens: Optional[torch.Tensor] = None, + max_seqlen: Optional[int] = None, + ): + out = apply_rotary( + x, + cos, + sin, + seqlen_offsets=seqlen_offsets, + cu_seqlens=cu_seqlens, + max_seqlen=max_seqlen, + interleaved=interleaved, + inplace=inplace, + ) + if isinstance(seqlen_offsets, int): + ctx.save_for_backward(cos, sin, cu_seqlens) # Can't save int with save_for_backward + ctx.seqlen_offsets = seqlen_offsets + else: + ctx.save_for_backward(cos, sin, cu_seqlens, seqlen_offsets) + ctx.seqlen_offsets = None + ctx.interleaved = interleaved + ctx.inplace = inplace + ctx.max_seqlen = max_seqlen + return out if not inplace else x + + @staticmethod + def backward(ctx, do): + seqlen_offsets = ctx.seqlen_offsets + if seqlen_offsets is None: + cos, sin, cu_seqlens, seqlen_offsets = ctx.saved_tensors + else: + cos, sin, cu_seqlens = ctx.saved_tensors + # TD [2023-09-02]: For some reason Triton (2.0.0.post1) errors with + # "[CUDA]: invalid device context", and cloning makes it 
work. Idk why. Triton 2.1.0 works. + if not ctx.interleaved and not ctx.inplace: + do = do.clone() + dx = apply_rotary( + do, + cos, + sin, + seqlen_offsets=seqlen_offsets, + cu_seqlens=cu_seqlens, + max_seqlen=ctx.max_seqlen, + interleaved=ctx.interleaved, + inplace=ctx.inplace, + conjugate=True, + ) + return dx, None, None, None, None, None, None, None + + +def apply_rotary_emb( + x, + cos, + sin, + interleaved=False, + inplace=False, + seqlen_offsets: Union[int, torch.Tensor] = 0, + cu_seqlens: Optional[torch.Tensor] = None, + max_seqlen: Optional[int] = None, +): + """ + Arguments: + x: (batch_size, seqlen, nheads, headdim) if cu_seqlens is None + else (total_seqlen, nheads, headdim) + cos, sin: (seqlen_rotary, rotary_dim / 2) + interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead + of 1st half and 2nd half (GPT-NeoX style). + inplace: if True, apply rotary embedding in-place. + seqlen_offsets: (batch_size,) or int. Each sequence in x is shifted by this amount. + Most commonly used in inference when we have KV cache. + cu_seqlens: (batch + 1,) or None + max_seqlen: int + Return: + out: (batch_size, seqlen, nheads, headdim) if cu_seqlens is None + else (total_seqlen, nheads, headdim) + rotary_dim must be <= headdim + Apply rotary embedding to the first rotary_dim of x. + """ + return ApplyRotaryEmb.apply( + x, cos, sin, interleaved, inplace, seqlen_offsets, cu_seqlens, max_seqlen + ) + + +# For backward compatibility +apply_rotary_emb_func = apply_rotary_emb + + +class FastRotaryEmbedding(torch.nn.Module): + """ + The rotary position embeddings from RoFormer_ (Su et. al). + A crucial insight from the method is that the query and keys are + transformed by rotation matrices which depend on the relative positions. + + Other implementations are available in the Rotary Transformer repo_ and in + GPT-NeoX_, GPT-NeoX was an inspiration + + .. _RoFormer: https://arxiv.org/abs/2104.09864 + .. _repo: https://github.com/ZhuiyiTechnology/roformer + .. _GPT-NeoX: https://github.com/EleutherAI/gpt-neox + + If scale_base is not None, this implements XPos (Sun et al., https://arxiv.org/abs/2212.10554). + A recommended value for scale_base is 512: https://github.com/HazyResearch/flash-attention/issues/96 + Reference: https://github.com/sunyt32/torchscale/blob/main/torchscale/component/xpos_relative_position.py + """ + + def __init__( + self, + dim: int, + base=10000, + interleaved=False, + scale_base=None, + pos_idx_in_fp32=True, + device=None, + ): + """ + interleaved: if True, rotate pairs of even and odd dimensions (GPT-J style) instead + of 1st half and 2nd half (GPT-NeoX style). + pos_idx_in_fp32: if True, the position indices [0.0, ..., seqlen - 1] are in fp32, + otherwise they might be in lower precision. + This option was added because previously (before 2023-07-02), when we construct + the position indices, we use the dtype of self.inv_freq. In most cases this would + be fp32, but if the model is trained in pure bf16 (not mixed precision), then + self.inv_freq would be bf16, and the position indices are also in bf16. + Because of the limited precision of bf16 (e.g. 1995.0 is rounded to 2000.0), the + embeddings for some positions will coincide. + To maintain compatibility with models previously trained in pure bf16, + we add this option. 
+ """ + super().__init__() + self.dim = dim + self.base = base + self.pos_idx_in_fp32 = pos_idx_in_fp32 + # Generate and save the inverse frequency buffer (non trainable) + inv_freq = self._compute_inv_freq(device) + self.register_buffer("inv_freq", inv_freq) + self.interleaved = interleaved + self.scale_base = scale_base + scale = ( + (torch.arange(0, dim, 2, device=device, dtype=torch.float32) + 0.4 * dim) / (1.4 * dim) + if scale_base is not None + else None + ) + self.register_buffer("scale", scale, persistent=False) + + self._seq_len_cached = 0 + self._cos_cached = None + self._sin_cached = None + self._cos_k_cached = None + self._sin_k_cached = None + self.cos = None + self.sin = None + + def _compute_inv_freq(self, device=None): + return 1.0 / ( + self.base + ** (torch.arange(0, self.dim, 2, device=device) / self.dim) + # ** (torch.arange(0, self.dim, 2, device=device).float() / self.dim) + ) + + def _update_cos_sin_cache(self, seqlen, position_id, device=None, dtype=None): + + if ( + seqlen > self._seq_len_cached + ): + self._seq_len_cached = seqlen + # We want fp32 here, not self.inv_freq.dtype, since the model could be loaded in bf16 + # And the output of arange can be quite large, so bf16 would lose a lot of precision. + # However, for compatibility reason, we add an option to use the dtype of self.inv_freq. + if self.pos_idx_in_fp32: + t = torch.arange(seqlen, device=device, dtype=torch.float32) + # We want fp32 here as well since inv_freq will be multiplied with t, and the output + # will be large. Having it in bf16 will lose a lot of precision and cause the + # cos & sin output to change significantly. + # We want to recompute self.inv_freq if it was not loaded in fp32 + if self.inv_freq.dtype != torch.float32: + inv_freq = self._compute_inv_freq(device=device) + else: + inv_freq = self.inv_freq + else: + t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype) + inv_freq = self.inv_freq + freqs = torch.einsum("i,j->ij", t, inv_freq) + if self.scale is None: + self._cos_cached = torch.cos(freqs).to(dtype) + self._sin_cached = torch.sin(freqs).to(dtype) + + else: + power = ( + torch.arange(seqlen, dtype=self.scale.dtype, device=self.scale.device) + - seqlen // 2 + ) / self.scale_base + scale = self.scale.to(device=power.device) ** rearrange(power, "s -> s 1") + # We want the multiplication by scale to happen in fp32 + self._cos_cached = (torch.cos(freqs) * scale).to(dtype) + self._sin_cached = (torch.sin(freqs) * scale).to(dtype) + self._cos_k_cached = (torch.cos(freqs) / scale).to(dtype) + self._sin_k_cached = (torch.sin(freqs) / scale).to(dtype) + + def forward( + self, + q: torch.Tensor, + k: torch.Tensor, + position_ids: torch.Tensor, + max_seqlen, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + q: (batch, nheads, seqlen, headdim) + k: (batch, nheads, seqlen, headdim) + position_id: (batch, seqlen) + max_seqlen: int + layer_id: int + only if layer_id == 0, then update cons and sin + Apply rotary embedding *inplace* to q k. 
+ """ + + self._update_cos_sin_cache(max_seqlen, position_ids, device=q.device, dtype=q.dtype) + cos, sin = F.embedding(position_ids, self._cos_cached), F.embedding(position_ids, self._sin_cached) + + q = apply_rotary_emb_func( + q, + cos, + sin, + interleaved=self.interleaved, + inplace=True + ) + k = apply_rotary_emb_func( + k, + cos, + sin, + interleaved=self.interleaved, + inplace=True + ) + return q, k diff --git a/visual.py b/visual.py new file mode 100644 index 0000000000000000000000000000000000000000..367e2fb1d19ff3b5c43fb898b7d15e436bbbf7cc --- /dev/null +++ b/visual.py @@ -0,0 +1,135 @@ +import torch +from torch import nn +from argparse import Namespace +import xformers.ops as xops +from transformers.activations import ACT2FN + + +class PatchEmbedding(nn.Module): + def __init__(self, config): + super().__init__() + self.proj = nn.Conv2d(config.in_channels, config.hidden_size, kernel_size=config.patch_size, stride=config.patch_size) + self.cls_embedding = nn.Parameter(torch.zeros(1, config.hidden_size)) + self.position_embedding = nn.Embedding(config.num_positions, config.hidden_size) + + def forward(self, images: "tensor(B, C, H, W)") -> "tensor(B, L, D)": + x = self.proj(images) + x = x.flatten(2).transpose(1, 2) + cls_token = self.cls_embedding.expand(x.shape[0], -1, -1) + x = torch.cat((cls_token, x), dim=1) + x += self.position_embedding.weight.unsqueeze(0) + return x + + +class Attention(nn.Module): + def __init__(self, config): + super().__init__() + self.num_heads = config.num_heads + head_dim = config.hidden_size // config.num_heads + self.scale = head_dim ** -0.5 + self.query_key_value = nn.Linear(config.hidden_size, config.hidden_size * 3) + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.output_dropout = torch.nn.Dropout(config.dropout_prob) + + def forward(self, x: "tensor(B, L, D)") -> "tensor(B, L, D)": + B, L, _ = x.shape + qkv = self.query_key_value(x) + qkv = qkv.reshape(B, L, 3, self.num_heads, -1).permute(2, 0, 1, 3, 4) # 3, B, L, H, D + q, k, v = qkv[0], qkv[1], qkv[2] + + out = xops.memory_efficient_attention( + q, k, v, scale=self.scale, + ) + output = self.dense(out.view(B, L, -1)) + output = self.output_dropout(output) + return output + + def attention(self, q, k, v): + attn_weights = torch.matmul(q * self.scale, k.transpose(-2, -1)) + attn_weights = attn_weights.softmax(dim=-1) + output = torch.matmul(attn_weights, v) + return output + + +class MLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.fc1(x) + x = self.activation_fn(x) + x = self.fc2(x) + return x + + +class TransformerLayer(nn.Module): + def __init__(self, config): + super().__init__() + self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.attention = Attention(config) + self.mlp = MLP(config) + self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + + def forward(self, hidden_states): + attention_input = hidden_states + attention_output = self.input_layernorm(self.attention(attention_input)) + hidden_states = attention_input + attention_output + mlp_input = hidden_states + mlp_output = self.post_attention_layernorm(self.mlp(mlp_input)) + output = mlp_input + mlp_output + return output + + +class 
Transformer(nn.Module): + def __init__(self, config): + super().__init__() + self.layers = nn.ModuleList([TransformerLayer(config) for _ in range(config.num_hidden_layers)]) + + def forward(self, hidden_states): + for layer_module in self.layers: + hidden_states = layer_module(hidden_states) + return hidden_states + + +class GLU(nn.Module): + def __init__(self, config, in_features): + super().__init__() + self.linear_proj = nn.Linear(in_features, config.hidden_size, bias=False) + self.norm1 = nn.LayerNorm(config.hidden_size) + self.act1 = nn.GELU() + self.act2 = nn.functional.silu + self.dense_h_to_4h = nn.Linear(config.hidden_size, config.intermediate_size, bias=False) + self.gate_proj = nn.Linear(config.hidden_size, config.intermediate_size, bias=False) + self.dense_4h_to_h = nn.Linear(config.intermediate_size, config.hidden_size, bias=False) + + def forward(self, x): + x = self.linear_proj(x) + x = self.act1(self.norm1(x)) + x = self.act2(self.gate_proj(x)) * self.dense_h_to_4h(x) + x = self.dense_4h_to_h(x) + return x + + +class EVA2CLIPModel(nn.Module): + def __init__(self, config): + super().__init__() + vision_config = Namespace(**config.vision_config) + self.patch_embedding = PatchEmbedding(vision_config) + self.transformer = Transformer(vision_config) + self.linear_proj = GLU(config, in_features=vision_config.hidden_size) + self.boi = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) + self.eoi = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) + + def forward(self, images: "tensor(B, C, H, W)") -> "tensor(B, L, D)": + x = self.patch_embedding(images) + x = self.transformer(x) + x = x[:, 1:] + x = self.linear_proj(x) + boi = self.boi.expand(x.shape[0], -1, -1) + eoi = self.eoi.expand(x.shape[0], -1, -1) + x = torch.cat((boi, x, eoi), dim=1) + return x
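For completeness, a minimal end-to-end usage sketch of the remote-code classes added above; the checkpoint path, tokenizer source, image file, and device/dtype choices are illustrative assumptions rather than part of this repository:

import torch
from PIL import Image
from transformers import AutoModelForCausalLM, LlamaTokenizer

# Hypothetical paths: substitute the actual checkpoint and a compatible LLaMA-style tokenizer.
MODEL_PATH = "./cogvlm-checkpoint"
TOKENIZER_PATH = "./llama-tokenizer"

tokenizer = LlamaTokenizer.from_pretrained(TOKENIZER_PATH)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH, torch_dtype=torch.bfloat16, trust_remote_code=True
).to("cuda").eval()

image = Image.open("example.jpg").convert("RGB")
feeds = model.build_conversation_input_ids(
    tokenizer, query="Describe this image.", history=[], images=[image]
)
inputs = {
    "input_ids": feeds["input_ids"].unsqueeze(0).to("cuda"),
    "token_type_ids": feeds["token_type_ids"].unsqueeze(0).to("cuda"),
    "attention_mask": feeds["attention_mask"].unsqueeze(0).to("cuda"),
    # `images` is a per-sample list of image tensors, matching the
    # List[List[torch.Tensor]] signature of CogVLMModel.forward.
    "images": [[feeds["images"][0].to("cuda", dtype=torch.bfloat16)]],
}

with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=128, do_sample=False)
    outputs = outputs[:, inputs["input_ids"].shape[1]:]  # strip the prompt tokens
    print(tokenizer.decode(outputs[0], skip_special_tokens=True))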