from typing import List

from transformers import PretrainedConfig
from transformers import Qwen2Config, CLIPVisionConfig


class InfMLLMUnifiedHDChatConfig(PretrainedConfig):
    """Configuration container for the InfMLLM unified HD chat model.

    Bundles a vision sub-config and a language-model sub-config together
    with a set of scalar options, all of which are stored as attributes of
    the same name on the instance.
    """

    def __init__(
        self,
        vison_config=None,
        lm_config=None,
        lm_model="",
        lm_tokenizer="",
        lora_modules="",
        lora_llm=False,
        lora_r=128,
        lora_alpha=256,
        lora_dropout=0,
        encoder_img="",
        image_size_img=336,
        lora_encoder_img=False,
        hd_num=9,
        encoder_video="",
        max_txt_len=4096,
        conv_style="qwen-7b-chat",
        precision="bf16",
        **kwargs
    ):
        """Store the given options and normalise the two sub-configs.

        Args:
            vison_config: Vision sub-config. A ``dict`` is expanded into a
                ``CLIPVisionConfig``; any other value (including ``None``)
                is stored unchanged. The result is exposed as
                ``self.vision_config``.
                NOTE(review): the parameter is spelled ``vison_config``
                while the attribute is ``vision_config``; the parameter
                name is kept as-is so existing keyword callers keep working.
            lm_config: Language-model sub-config. A ``dict`` is expanded
                into a ``Qwen2Config``; any other value is stored unchanged
                as ``self.lm_config``.
            lm_model: Stored as ``self.lm_model``.
            lm_tokenizer: Stored as ``self.lm_tokenizer``.
            lora_modules: Stored as ``self.lora_modules``.
            lora_llm: Stored as ``self.lora_llm``.
            lora_r: Stored as ``self.lora_r``.
            lora_alpha: Stored as ``self.lora_alpha``.
            lora_dropout: Stored as ``self.lora_dropout``.
            encoder_img: Stored as ``self.encoder_img``.
            image_size_img: Stored as ``self.image_size_img``.
            lora_encoder_img: Stored as ``self.lora_encoder_img``.
            hd_num: Stored as ``self.hd_num``.
            encoder_video: Stored as ``self.encoder_video``.
            max_txt_len: Stored as ``self.max_txt_len``.
            conv_style: Stored as ``self.conv_style``.
            precision: Stored as ``self.precision``.
            **kwargs: Forwarded untouched to ``PretrainedConfig.__init__``.
        """
        # Language-model and LoRA options.
        self.lm_model = lm_model
        self.lm_tokenizer = lm_tokenizer
        self.lora_modules = lora_modules
        self.lora_llm = lora_llm
        self.lora_r = lora_r
        self.lora_alpha = lora_alpha
        self.lora_dropout = lora_dropout

        # Image-encoder options.
        self.encoder_img = encoder_img
        self.image_size_img = image_size_img
        self.lora_encoder_img = lora_encoder_img
        self.hd_num = hd_num

        # Video-encoder option.
        self.encoder_video = encoder_video

        # Text / conversation / numeric-precision options.
        self.max_txt_len = max_txt_len
        self.conv_style = conv_style
        self.precision = precision

        # Sub-configs may arrive as plain mappings; expand those into the
        # matching config class. isinstance() is used so that dict
        # subclasses (e.g. OrderedDict) are expanded as well.
        if isinstance(vison_config, dict):
            self.vision_config = CLIPVisionConfig(**vison_config)
        else:
            self.vision_config = vison_config

        if isinstance(lm_config, dict):
            self.lm_config = Qwen2Config(**lm_config)
        else:
            self.lm_config = lm_config

        # Base-class initialisation runs last, after all attributes above
        # have been assigned, and receives only the leftover keyword args.
        super().__init__(**kwargs)