Manli committed
Commit be3fa06
Parent: 9f3bae4

Repack the model w/ the new modeling file

config.json CHANGED
@@ -3,7 +3,7 @@
     "XGenMMModelForConditionalGeneration"
   ],
   "auto_map": {
-    "AutoConfig": "configuration_xgenmm.XGenMMConfig",
+    "AutoConfig": "modeling_xgenmm.XGenMMConfig",
     "AutoModelForVision2Seq": "modeling_xgenmm.XGenMMModelForConditionalGeneration"
   },
   "model_type": "xgenmm",
configuration_xgenmm.py DELETED
@@ -1,159 +0,0 @@
-from transformers import PretrainedConfig
-from transformers import logging
-from transformers import CONFIG_MAPPING
-
-logger = logging.get_logger(__name__)
-
-class XGenMMVisionEncoderConfig(PretrainedConfig):
-    model_type = "xgenmm_vision_encoder"
-
-    def __init__(self,
-                 model_name: str = 'google/siglip-so400m-patch14-384',
-                 anyres_grids: list[int] = [[384, 768],[768, 384],[768, 768],[1152, 384],[384,1152]],
-                 **kwargs):
-        self.model_name = model_name
-        self.anyres_grids = anyres_grids
-        super().__init__(**kwargs)
-
-
-class XGenMMVisionTokenizerConfig(PretrainedConfig):
-    model_type = "xgenmm_vision_tokenizer"
-
-    def __init__(self,
-                 vis_feature_dim: int = 1152,
-                 lang_embedding_dim: int = 3072,
-                 num_vis_tokens: int = 128,
-                 image_aspect_ratio: str = 'anyres',
-                 **kwargs):
-        self.vis_feature_dim = vis_feature_dim
-        self.lang_embedding_dim = lang_embedding_dim
-        self.num_vis_tokens = num_vis_tokens
-        self.image_aspect_ratio = image_aspect_ratio
-        super().__init__(**kwargs)
-
-
-class XGenMMConfig(PretrainedConfig):
-    model_type = "xgenmm"
-
-    def __init__(self,
-                 vision_encoder_config: dict = None,
-                 vision_tokenizer_config: dict = None,
-                 text_config: dict = None,
-                 **kwargs):
-
-        if vision_encoder_config is None:
-            vision_encoder_config = {'image_aspect_ratio': 'anyres', 'anyres_patch_sampling': True}
-            logger.info("vision_encoder_config is None. initializing the XGenMMVisionEncoderConfig with default values.")
-
-        if vision_tokenizer_config is None:
-            vision_tokenizer_config = {}
-            logger.info("vision_tokenizer_config is None. Initializing the XGenMMVisionTokenizerConfig with default values.")
-
-        if text_config is None:
-            text_config = {
-                'initial_tokenizer_len':32012,
-                'pad_token_id':32011,
-                'bos_token_id':1,
-                'eos_token_id':32000,
-                'vocab_size': 32064,
-                'hidden_size': 3072,
-                'intermediate_size': 8192,
-                'num_hidden_layers': 32,
-                'num_attention_heads': 32,
-                'num_key_value_heads': 32,
-                'resid_pdrop': 0.0,
-                'embd_pdrop': 0.0,
-                'attention_dropout': 0.0,
-                'hidden_act': 'silu',
-                'max_position_embeddings': 4096,
-                'original_max_position_embeddings': 4096,
-                'initializer_range': 0.02,
-                'rms_norm_eps': 1e-05,
-                'use_cache': True,
-                'rope_theta': 10000.0,
-                'rope_scaling': None,
-                'sliding_window': 2047,
-                'return_dict': True,
-                'output_hidden_states': False,
-                'output_attentions': False,
-                'torchscript': False,
-                'torch_dtype': 'bfloat16',
-                'use_bfloat16': False,
-                'tf_legacy_loss': False,
-                'pruned_heads': {},
-                'tie_word_embeddings': False,
-                'chunk_size_feed_forward': 0,
-                'is_encoder_decoder': False,
-                'is_decoder': False,
-                'cross_attention_hidden_size': None,
-                'add_cross_attention': False,
-                'tie_encoder_decoder': False,
-                'max_length': 20,
-                'min_length': 0,
-                'do_sample': False,
-                'early_stopping': False,
-                'num_beams': 1,
-                'num_beam_groups': 1,
-                'diversity_penalty': 0.0,
-                'temperature': 1.0,
-                'top_k': 50,
-                'top_p': 1.0,
-                'typical_p': 1.0,
-                'repetition_penalty': 1.0,
-                'length_penalty': 1.0,
-                'no_repeat_ngram_size': 0,
-                'encoder_no_repeat_ngram_size': 0,
-                'bad_words_ids': None,
-                'num_return_sequences': 1,
-                'output_scores': False,
-                'return_dict_in_generate': False,
-                'forced_bos_token_id': None,
-                'forced_eos_token_id': None,
-                'remove_invalid_values': False,
-                'exponential_decay_length_penalty': None,
-                'suppress_tokens': None,
-                'begin_suppress_tokens': None,
-                'finetuning_task': None,
-                'id2label': {0: 'LABEL_0', 1: 'LABEL_1'},
-                'label2id': {'LABEL_0': 0, 'LABEL_1': 1},
-                'tokenizer_class': None,
-                'prefix': None,
-                'bos_token_id': 1,
-                'pad_token_id': 32000,
-                'eos_token_id': 32000,
-                'sep_token_id': None,
-                'decoder_start_token_id': None,
-                'task_specific_params': None,
-                'problem_type': None,
-                'model_type': 'phi3'
-            }
-            logger.info("text_config is None. Initializing the text config with default values (`Phi3Config`).")
-
-        self.vision_encoder_config = XGenMMVisionEncoderConfig(**vision_encoder_config)
-
-        self.vision_tokenizer_config = XGenMMVisionTokenizerConfig(**vision_tokenizer_config)
-
-        text_model_type = text_config["model_type"] if "model_type" in text_config else "phi3"
-        self.text_config = CONFIG_MAPPING[text_model_type](**text_config)
-
-        for key in ['initial_tokenizer_len', 'pad_token_id']:
-            if key not in self.text_config.to_dict():
-                raise ValueError(f"The key `{key}` is missing in the text_config.")
-
-        super().__init__(**kwargs)
-
-    @classmethod
-    def from_vision_encoder_vision_tokenizer_text_configs(
-        cls,
-        vision_encoder_config: XGenMMVisionEncoderConfig,
-        vision_tokenizer_config: XGenMMVisionTokenizerConfig,
-        text_config: PretrainedConfig,
-        **kwargs):
-
-        return cls(
-            vision_encoder_config=vision_encoder_config.to_dict(),
-            vision_tokenizer_config=vision_tokenizer_config.to_dict(),
-            text_config=text_config.to_dict(),
-            **kwargs,
-        )
-
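
The three classes above reappear in modeling_xgenmm.py further down (black-formatted, and without the `from_vision_encoder_vision_tokenizer_text_configs` helper). The `CONFIG_MAPPING` dispatch that `XGenMMConfig.__init__` relies on is worth a quick sketch; this assumes a transformers release that registers the `phi3` model type:

from transformers import CONFIG_MAPPING

# XGenMMConfig resolves the text sub-config class from its "model_type" key,
# so the Phi-3 default dict above materializes as a Phi3Config instance.
# Extra keys such as initial_tokenizer_len are kept as plain attributes.
text_config = {"model_type": "phi3", "initial_tokenizer_len": 32012, "pad_token_id": 32011}
text_model_type = text_config.get("model_type", "phi3")
cfg = CONFIG_MAPPING[text_model_type](**text_config)

print(type(cfg).__name__)         # Phi3Config
print(cfg.initial_tokenizer_len)  # 32012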
image_processing_blip_3.py CHANGED
@@ -13,7 +13,18 @@ from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
 from transformers.image_utils import ImageInput
 from transformers.utils import TensorType
 
-from utils import expand2square
+def expand2square(pil_img, background_color):
+    width, height = pil_img.size
+    if width == height:
+        return pil_img
+    elif width > height:
+        result = Image.new(pil_img.mode, (width, width), background_color)
+        result.paste(pil_img, (0, (width - height) // 2))
+        return result
+    else:
+        result = Image.new(pil_img.mode, (height, height), background_color)
+        result.paste(pil_img, ((height - width) // 2, 0))
+        return result
 
 
 class Blip3ImageProcessor(BaseImageProcessor):
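
Inlining `expand2square` removes the dependency on a loose `utils` module that is not shipped with the model files; the helper pads the short side of an image with a solid color so the processor sees a square input. A quick usage sketch (the fill color and import path are illustrative, assuming the repo file is on sys.path):

from PIL import Image
from image_processing_blip_3 import expand2square  # assumes the repo file is importable

img = Image.new("RGB", (300, 200), "red")      # landscape input
squared = expand2square(img, (127, 127, 127))  # pad with grey to 300x300
print(squared.size)  # (300, 300); the original is pasted at y = (300 - 200) // 2 = 50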
modeling_xgenmm.py CHANGED
@@ -18,6 +18,173 @@ from transformers import PretrainedConfig, logging, CONFIG_MAPPING
 from transformers.models.siglip.modeling_siglip import SiglipVisionTransformer
 
 
+logger = logging.get_logger(__name__)
+
+
+class XGenMMVisionEncoderConfig(PretrainedConfig):
+    model_type = "xgenmm_vision_encoder"
+
+    def __init__(
+        self,
+        model_name: str = "google/siglip-so400m-patch14-384",
+        anyres_grids: list[int] = [
+            [384, 768],
+            [768, 384],
+            [768, 768],
+            [1152, 384],
+            [384, 1152],
+        ],
+        **kwargs,
+    ):
+        self.model_name = model_name
+        self.anyres_grids = anyres_grids
+        super().__init__(**kwargs)
+
+
+class XGenMMVisionTokenizerConfig(PretrainedConfig):
+    model_type = "xgenmm_vision_tokenizer"
+
+    def __init__(
+        self,
+        vis_feature_dim: int = 1152,
+        lang_embedding_dim: int = 3072,
+        num_vis_tokens: int = 128,
+        image_aspect_ratio: str = "anyres",
+        **kwargs,
+    ):
+        self.vis_feature_dim = vis_feature_dim
+        self.lang_embedding_dim = lang_embedding_dim
+        self.num_vis_tokens = num_vis_tokens
+        self.image_aspect_ratio = image_aspect_ratio
+        super().__init__(**kwargs)
+
+
+class XGenMMConfig(PretrainedConfig):
+    model_type = "xgenmm"
+
+    def __init__(
+        self,
+        vision_encoder_config: dict = None,
+        vision_tokenizer_config: dict = None,
+        text_config: dict = None,
+        **kwargs,
+    ):
+
+        if vision_encoder_config is None:
+            vision_encoder_config = {
+                "image_aspect_ratio": "anyres",
+                "anyres_patch_sampling": True,
+            }
+            logger.info(
+                "vision_encoder_config is None. initializing the XGenMMVisionEncoderConfig with default values."
+            )
+
+        if vision_tokenizer_config is None:
+            vision_tokenizer_config = {}
+            logger.info(
+                "vision_tokenizer_config is None. Initializing the XGenMMVisionTokenizerConfig with default values."
+            )
+
+        if text_config is None:
+            text_config = {
+                "initial_tokenizer_len": 32012,
+                "pad_token_id": 32011,
+                "bos_token_id": 1,
+                "eos_token_id": 32000,
+                "vocab_size": 32064,
+                "hidden_size": 3072,
+                "intermediate_size": 8192,
+                "num_hidden_layers": 32,
+                "num_attention_heads": 32,
+                "num_key_value_heads": 32,
+                "resid_pdrop": 0.0,
+                "embd_pdrop": 0.0,
+                "attention_dropout": 0.0,
+                "hidden_act": "silu",
+                "max_position_embeddings": 4096,
+                "original_max_position_embeddings": 4096,
+                "initializer_range": 0.02,
+                "rms_norm_eps": 1e-05,
+                "use_cache": True,
+                "rope_theta": 10000.0,
+                "rope_scaling": None,
+                "sliding_window": 2047,
+                "return_dict": True,
+                "output_hidden_states": False,
+                "output_attentions": False,
+                "torchscript": False,
+                "torch_dtype": "bfloat16",
+                "use_bfloat16": False,
+                "tf_legacy_loss": False,
+                "pruned_heads": {},
+                "tie_word_embeddings": False,
+                "chunk_size_feed_forward": 0,
+                "is_encoder_decoder": False,
+                "is_decoder": False,
+                "cross_attention_hidden_size": None,
+                "add_cross_attention": False,
+                "tie_encoder_decoder": False,
+                "max_length": 20,
+                "min_length": 0,
+                "do_sample": False,
+                "early_stopping": False,
+                "num_beams": 1,
+                "num_beam_groups": 1,
+                "diversity_penalty": 0.0,
+                "temperature": 1.0,
+                "top_k": 50,
+                "top_p": 1.0,
+                "typical_p": 1.0,
+                "repetition_penalty": 1.0,
+                "length_penalty": 1.0,
+                "no_repeat_ngram_size": 0,
+                "encoder_no_repeat_ngram_size": 0,
+                "bad_words_ids": None,
+                "num_return_sequences": 1,
+                "output_scores": False,
+                "return_dict_in_generate": False,
+                "forced_bos_token_id": None,
+                "forced_eos_token_id": None,
+                "remove_invalid_values": False,
+                "exponential_decay_length_penalty": None,
+                "suppress_tokens": None,
+                "begin_suppress_tokens": None,
+                "finetuning_task": None,
+                "id2label": {0: "LABEL_0", 1: "LABEL_1"},
+                "label2id": {"LABEL_0": 0, "LABEL_1": 1},
+                "tokenizer_class": None,
+                "prefix": None,
+                "bos_token_id": 1,
+                "pad_token_id": 32000,
+                "eos_token_id": 32000,
+                "sep_token_id": None,
+                "decoder_start_token_id": None,
+                "task_specific_params": None,
+                "problem_type": None,
+                "model_type": "phi3",
+            }
+            logger.info(
+                "text_config is None. Initializing the text config with default values (`Phi3Config`)."
+            )
+
+        self.vision_encoder_config = XGenMMVisionEncoderConfig(**vision_encoder_config)
+
+        self.vision_tokenizer_config = XGenMMVisionTokenizerConfig(
+            **vision_tokenizer_config
+        )
+
+        text_model_type = (
+            text_config["model_type"] if "model_type" in text_config else "phi3"
+        )
+        self.text_config = CONFIG_MAPPING[text_model_type](**text_config)
+
+        for key in ["initial_tokenizer_len", "pad_token_id"]:
+            if key not in self.text_config.to_dict():
+                raise ValueError(f"The key `{key}` is missing in the text_config.")
+
+        super().__init__(**kwargs)
+
+
 def hasattr_recursive(obj, att):
     """
     Check if obj has nested attribute
@@ -1820,170 +1987,6 @@ class XGenMMPerceiver(VLMWithLanguageStream):
         return output
 
 
-class XGenMMVisionEncoderConfig(PretrainedConfig):
-    model_type = "xgenmm_vision_encoder"
-
-    def __init__(
-        self,
-        model_name: str = "google/siglip-so400m-patch14-384",
-        anyres_grids: list[int] = [
-            [384, 768],
-            [768, 384],
-            [768, 768],
-            [1152, 384],
-            [384, 1152],
-        ],
-        **kwargs,
-    ):
-        self.model_name = model_name
-        self.anyres_grids = anyres_grids
-        super().__init__(**kwargs)
-
-
-class XGenMMVisionTokenizerConfig(PretrainedConfig):
-    model_type = "xgenmm_vision_tokenizer"
-
-    def __init__(
-        self,
-        vis_feature_dim: int = 1152,
-        lang_embedding_dim: int = 3072,
-        num_vis_tokens: int = 128,
-        image_aspect_ratio: str = "anyres",
-        **kwargs,
-    ):
-        self.vis_feature_dim = vis_feature_dim
-        self.lang_embedding_dim = lang_embedding_dim
-        self.num_vis_tokens = num_vis_tokens
-        self.image_aspect_ratio = image_aspect_ratio
-        super().__init__(**kwargs)
-
-
-class XGenMMConfig(PretrainedConfig):
-    model_type = "xgenmm"
-
-    def __init__(
-        self,
-        vision_encoder_config: dict = None,
-        vision_tokenizer_config: dict = None,
-        text_config: dict = None,
-        **kwargs,
-    ):
-
-        if vision_encoder_config is None:
-            vision_encoder_config = {
-                "image_aspect_ratio": "anyres",
-                "anyres_patch_sampling": True,
-            }
-            logger.info(
-                "vision_encoder_config is None. initializing the XGenMMVisionEncoderConfig with default values."
-            )
-
-        if vision_tokenizer_config is None:
-            vision_tokenizer_config = {}
-            logger.info(
-                "vision_tokenizer_config is None. Initializing the XGenMMVisionTokenizerConfig with default values."
-            )
-
-        if text_config is None:
-            text_config = {
-                "initial_tokenizer_len": 32012,
-                "pad_token_id": 32011,
-                "bos_token_id": 1,
-                "eos_token_id": 32000,
-                "vocab_size": 32064,
-                "hidden_size": 3072,
-                "intermediate_size": 8192,
-                "num_hidden_layers": 32,
-                "num_attention_heads": 32,
-                "num_key_value_heads": 32,
-                "resid_pdrop": 0.0,
-                "embd_pdrop": 0.0,
-                "attention_dropout": 0.0,
-                "hidden_act": "silu",
-                "max_position_embeddings": 4096,
-                "original_max_position_embeddings": 4096,
-                "initializer_range": 0.02,
-                "rms_norm_eps": 1e-05,
-                "use_cache": True,
-                "rope_theta": 10000.0,
-                "rope_scaling": None,
-                "sliding_window": 2047,
-                "return_dict": True,
-                "output_hidden_states": False,
-                "output_attentions": False,
-                "torchscript": False,
-                "torch_dtype": "bfloat16",
-                "use_bfloat16": False,
-                "tf_legacy_loss": False,
-                "pruned_heads": {},
-                "tie_word_embeddings": False,
-                "chunk_size_feed_forward": 0,
-                "is_encoder_decoder": False,
-                "is_decoder": False,
-                "cross_attention_hidden_size": None,
-                "add_cross_attention": False,
-                "tie_encoder_decoder": False,
-                "max_length": 20,
-                "min_length": 0,
-                "do_sample": False,
-                "early_stopping": False,
-                "num_beams": 1,
-                "num_beam_groups": 1,
-                "diversity_penalty": 0.0,
-                "temperature": 1.0,
-                "top_k": 50,
-                "top_p": 1.0,
-                "typical_p": 1.0,
-                "repetition_penalty": 1.0,
-                "length_penalty": 1.0,
-                "no_repeat_ngram_size": 0,
-                "encoder_no_repeat_ngram_size": 0,
-                "bad_words_ids": None,
-                "num_return_sequences": 1,
-                "output_scores": False,
-                "return_dict_in_generate": False,
-                "forced_bos_token_id": None,
-                "forced_eos_token_id": None,
-                "remove_invalid_values": False,
-                "exponential_decay_length_penalty": None,
-                "suppress_tokens": None,
-                "begin_suppress_tokens": None,
-                "finetuning_task": None,
-                "id2label": {0: "LABEL_0", 1: "LABEL_1"},
-                "label2id": {"LABEL_0": 0, "LABEL_1": 1},
-                "tokenizer_class": None,
-                "prefix": None,
-                "bos_token_id": 1,
-                "pad_token_id": 32000,
-                "eos_token_id": 32000,
-                "sep_token_id": None,
-                "decoder_start_token_id": None,
-                "task_specific_params": None,
-                "problem_type": None,
-                "model_type": "phi3",
-            }
-            logger.info(
-                "text_config is None. Initializing the text config with default values (`Phi3Config`)."
-            )
-
-        self.vision_encoder_config = XGenMMVisionEncoderConfig(**vision_encoder_config)
-
-        self.vision_tokenizer_config = XGenMMVisionTokenizerConfig(
-            **vision_tokenizer_config
-        )
-
-        text_model_type = (
-            text_config["model_type"] if "model_type" in text_config else "phi3"
-        )
-        self.text_config = CONFIG_MAPPING[text_model_type](**text_config)
-
-        for key in ["initial_tokenizer_len", "pad_token_id"]:
-            if key not in self.text_config.to_dict():
-                raise ValueError(f"The key `{key}` is missing in the text_config.")
-
-        super().__init__(**kwargs)
-
-
 class XGenMMVisionEncoder(PreTrainedModel):
     main_input_name = "pixel_values"
     config_class = XGenMMVisionEncoderConfig
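
With this move, modeling_xgenmm.py is self-contained (logger, config classes, and model classes in one module), which is what lets config.json point both `auto_map` entries at it. A sketch of exercising the relocated config directly; the import path is hypothetical and assumes the file sits on sys.path next to the checkpoint:

# Hypothetical direct use of the relocated classes.
from modeling_xgenmm import XGenMMConfig

config = XGenMMConfig()  # every sub-config falls back to the defaults above
print(config.vision_encoder_config.model_name)        # google/siglip-so400m-patch14-384
print(config.vision_tokenizer_config.num_vis_tokens)  # 128
print(config.text_config.model_type)                  # phi3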