from transformers import PretrainedConfig from transformers import logging from transformers import CONFIG_MAPPING logger = logging.get_logger(__name__) class XGenMMVisionEncoderConfig(PretrainedConfig): model_type = "xgenmm_vision_encoder" def __init__(self, model_name: str = 'google/siglip-so400m-patch14-384', anyres_grids: list[int] = [[384, 768],[768, 384],[768, 768],[1152, 384],[384,1152]], **kwargs): self.model_name = model_name self.anyres_grids = anyres_grids super().__init__(**kwargs) class XGenMMVisionTokenizerConfig(PretrainedConfig): model_type = "xgenmm_vision_tokenizer" def __init__(self, vis_feature_dim: int = 1152, lang_embedding_dim: int = 3072, num_vis_tokens: int = 128, image_aspect_ratio: str = 'anyres', **kwargs): self.vis_feature_dim = vis_feature_dim self.lang_embedding_dim = lang_embedding_dim self.num_vis_tokens = num_vis_tokens self.image_aspect_ratio = image_aspect_ratio super().__init__(**kwargs) class XGenMMConfig(PretrainedConfig): model_type = "xgenmm" def __init__(self, vision_encoder_config: dict = None, vision_tokenizer_config: dict = None, text_config: dict = None, **kwargs): if vision_encoder_config is None: vision_encoder_config = {'image_aspect_ratio': 'anyres', 'anyres_patch_sampling': True} logger.info("vision_encoder_config is None. initializing the XGenMMVisionEncoderConfig with default values.") if vision_tokenizer_config is None: vision_tokenizer_config = {} logger.info("vision_tokenizer_config is None. Initializing the XGenMMVisionTokenizerConfig with default values.") if text_config is None: text_config = { 'initial_tokenizer_len':32012, 'pad_token_id':32011, 'bos_token_id':1, 'eos_token_id':32000, 'vocab_size': 32064, 'hidden_size': 3072, 'intermediate_size': 8192, 'num_hidden_layers': 32, 'num_attention_heads': 32, 'num_key_value_heads': 32, 'resid_pdrop': 0.0, 'embd_pdrop': 0.0, 'attention_dropout': 0.0, 'hidden_act': 'silu', 'max_position_embeddings': 4096, 'original_max_position_embeddings': 4096, 'initializer_range': 0.02, 'rms_norm_eps': 1e-05, 'use_cache': True, 'rope_theta': 10000.0, 'rope_scaling': None, 'sliding_window': 2047, 'return_dict': True, 'output_hidden_states': False, 'output_attentions': False, 'torchscript': False, 'torch_dtype': 'bfloat16', 'use_bfloat16': False, 'tf_legacy_loss': False, 'pruned_heads': {}, 'tie_word_embeddings': False, 'chunk_size_feed_forward': 0, 'is_encoder_decoder': False, 'is_decoder': False, 'cross_attention_hidden_size': None, 'add_cross_attention': False, 'tie_encoder_decoder': False, 'max_length': 20, 'min_length': 0, 'do_sample': False, 'early_stopping': False, 'num_beams': 1, 'num_beam_groups': 1, 'diversity_penalty': 0.0, 'temperature': 1.0, 'top_k': 50, 'top_p': 1.0, 'typical_p': 1.0, 'repetition_penalty': 1.0, 'length_penalty': 1.0, 'no_repeat_ngram_size': 0, 'encoder_no_repeat_ngram_size': 0, 'bad_words_ids': None, 'num_return_sequences': 1, 'output_scores': False, 'return_dict_in_generate': False, 'forced_bos_token_id': None, 'forced_eos_token_id': None, 'remove_invalid_values': False, 'exponential_decay_length_penalty': None, 'suppress_tokens': None, 'begin_suppress_tokens': None, 'finetuning_task': None, 'id2label': {0: 'LABEL_0', 1: 'LABEL_1'}, 'label2id': {'LABEL_0': 0, 'LABEL_1': 1}, 'tokenizer_class': None, 'prefix': None, 'bos_token_id': 1, 'pad_token_id': 32000, 'eos_token_id': 32000, 'sep_token_id': None, 'decoder_start_token_id': None, 'task_specific_params': None, 'problem_type': None, 'model_type': 'phi3' } logger.info("text_config is None. Initializing the text config with default values (`Phi3Config`).") self.vision_encoder_config = XGenMMVisionEncoderConfig(**vision_encoder_config) self.vision_tokenizer_config = XGenMMVisionTokenizerConfig(**vision_tokenizer_config) text_model_type = text_config["model_type"] if "model_type" in text_config else "phi3" self.text_config = CONFIG_MAPPING[text_model_type](**text_config) for key in ['initial_tokenizer_len', 'pad_token_id']: if key not in self.text_config.to_dict(): raise ValueError(f"The key `{key}` is missing in the text_config.") super().__init__(**kwargs) @classmethod def from_vision_encoder_vision_tokenizer_text_configs( cls, vision_encoder_config: XGenMMVisionEncoderConfig, vision_tokenizer_config: XGenMMVisionTokenizerConfig, text_config: PretrainedConfig, **kwargs): return cls( vision_encoder_config=vision_encoder_config.to_dict(), vision_tokenizer_config=vision_tokenizer_config.to_dict(), text_config=text_config.to_dict(), **kwargs, )