File size: 8,762 Bytes
2908489
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3bce52e
2908489
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
import torch
import torch.amp.autocast_mode
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
import os
import sys
import logging
import warnings
import argparse
from PIL import Image
from pathlib import Path
from tqdm import tqdm
from torch import nn
from transformers import AutoModel, AutoProcessor, AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast, AutoModelForCausalLM
from typing import List, Union

# Constants
# Model identifier passed to `from_pretrained` for both the image processor
# and the vision model (see `load_models`).
CLIP_PATH = "google/siglip-so400m-patch14-384"
# Text prompt that `stream_chat` tokenizes and embeds after the image embeddings.
VLM_PROMPT = "A descriptive caption for this image:\n"
# Model identifier passed to `from_pretrained` for the tokenizer and the causal LM.
MODEL_PATH = "unsloth/Meta-Llama-3.1-8B-bnb-4bit"
# Directory from which `load_models` reads "image_adapter.pt".
CHECKPOINT_PATH = Path("wpkklhc6")
# File suffixes (compared lower-cased) that are treated as images.
IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.bmp', '.webp')

# Suppress UserWarning messages and restrict the "transformers" logger to errors.
warnings.filterwarnings("ignore", category=UserWarning)
logging.getLogger("transformers").setLevel(logging.ERROR)

def setup(rank, world_size):
    """Join the NCCL process group and bind this process to its GPU.

    Args:
        rank: Index of this process in the group; also used as the CUDA
            device index.
        world_size: Total number of processes in the group.

    The rendezvous endpoint defaults to ``localhost:12355``. A ``MASTER_ADDR``
    or ``MASTER_PORT`` already present in the environment is kept, so two jobs
    on the same machine can be given different ports.
    """
    os.environ.setdefault('MASTER_ADDR', 'localhost')
    os.environ.setdefault('MASTER_PORT', '12355')
    # Make `rank` the current device so that device-less CUDA calls (e.g.
    # torch.cuda.empty_cache()) do not create a context on GPU 0 in every worker.
    torch.cuda.set_device(rank)
    dist.init_process_group("nccl", rank=rank, world_size=world_size)

def cleanup():
    """Destroy the default process group if one is currently initialised.

    Calling this when no group exists (for example after a failed start-up)
    is a no-op instead of an error.
    """
    if dist.is_available() and dist.is_initialized():
        dist.destroy_process_group()

class ImageAdapter(nn.Module):
    """Linear -> GELU -> Linear projection applied to vision features.

    The first layer maps ``input_features`` to ``output_features``; the second
    keeps the width at ``output_features``.
    """

    def __init__(self, input_features: int, output_features: int):
        super().__init__()
        # Attribute names double as state-dict keys, so they are kept as-is.
        self.linear1 = nn.Linear(in_features=input_features, out_features=output_features)
        self.activation = nn.GELU()
        self.linear2 = nn.Linear(in_features=output_features, out_features=output_features)

    def forward(self, vision_outputs: torch.Tensor):
        """Project ``vision_outputs`` through both layers and return the result."""
        projected = self.linear1(vision_outputs)
        activated = self.activation(projected)
        return self.linear2(activated)

def load_models(rank):
    """Load every model needed for captioning onto the GPU given by ``rank``.

    Args:
        rank: CUDA device index the models are placed on.

    Returns:
        The tuple ``(clip_processor, clip_model, tokenizer, text_model,
        image_adapter)``, in the order ``stream_chat`` unpacks it.

    Raises:
        TypeError: If the loaded tokenizer is neither a ``PreTrainedTokenizer``
            nor a ``PreTrainedTokenizerFast``.
    """
    print(f"Loading CLIP 📎 on GPU {rank}")
    clip_processor = AutoProcessor.from_pretrained(CLIP_PATH)
    # Only the vision sub-model is kept; it is frozen and put in eval mode.
    clip_model = AutoModel.from_pretrained(CLIP_PATH).vision_model.eval().requires_grad_(False).to(rank)

    print(f"Loading tokenizer 🪙 on GPU {rank}")
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=False)
    # Explicit raise rather than assert so the check survives `python -O`.
    if not isinstance(tokenizer, (PreTrainedTokenizer, PreTrainedTokenizerFast)):
        raise TypeError(f"Tokenizer is of type {type(tokenizer)}")

    print(f"Loading LLM 🤖 on GPU {rank}")
    text_model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, device_map={"": rank}, torch_dtype=torch.bfloat16).eval()

    print(f"Loading image adapter 🖼️ on GPU {rank}")
    # The adapter bridges the vision model's hidden size to the LLM's hidden size.
    image_adapter = ImageAdapter(clip_model.config.hidden_size, text_model.config.hidden_size)
    image_adapter.load_state_dict(torch.load(CHECKPOINT_PATH / "image_adapter.pt", map_location=f"cuda:{rank}", weights_only=True))
    image_adapter.eval().to(rank)

    return clip_processor, clip_model, tokenizer, text_model, image_adapter

@torch.no_grad()
def stream_chat(input_images: List[Image.Image], batch_size: int, pbar: tqdm, models: tuple, rank: int) -> List[str]:
    """Generate one caption per image, processing ``batch_size`` images at a time.

    Args:
        input_images: Images to caption.
        batch_size: Number of images sent through the models per step.
        pbar: Progress bar advanced after each batch (only on rank 0), or
            ``None`` to disable progress reporting.
        models: The tuple returned by ``load_models``.
        rank: CUDA device index the tensors are moved to.

    Returns:
        Captions in input order. A batch whose preprocessing raises
        ``ValueError`` is skipped with a printed message, so the result can be
        shorter than ``input_images``.

    Generation samples (``do_sample=True``, ``top_k=10``, ``temperature=0.5``)
    and produces at most 300 new tokens per image.
    """
    clip_processor, clip_model, tokenizer, text_model, image_adapter = models
    torch.cuda.empty_cache()
    all_captions = []
    if not input_images:
        return all_captions

    # The prompt and BOS token are the same for every batch, so tokenize and
    # embed them once instead of once per loop iteration.
    prompt = tokenizer.encode(VLM_PROMPT, return_tensors='pt')
    prompt_embeds = text_model.model.embed_tokens(prompt.to(rank)).to(dtype=torch.bfloat16)
    bos_ids = torch.tensor([[tokenizer.bos_token_id]], dtype=torch.long)
    embedded_bos = text_model.model.embed_tokens(bos_ids.to(rank)).to(dtype=torch.bfloat16)

    for i in range(0, len(input_images), batch_size):
        batch = input_images[i:i+batch_size]

        try:
            images = clip_processor(images=batch, return_tensors='pt', padding=True).pixel_values.to(rank)
        except ValueError as e:
            print(f"Error processing image batch: {e}")
            print("Skipping this batch and continuing...")
            continue

        with torch.amp.autocast_mode.autocast(device_type='cuda', enabled=True):
            vision_outputs = clip_model(pixel_values=images, output_hidden_states=True)
            # The second-to-last hidden state is what gets fed to the adapter.
            image_features = vision_outputs.hidden_states[-2]
            embedded_images = image_adapter(image_features).to(dtype=torch.bfloat16)

        n_images = embedded_images.shape[0]
        n_image_tokens = embedded_images.shape[1]

        # Embedding sequence per image: [BOS] [image tokens] [prompt tokens].
        inputs_embeds = torch.cat([
            embedded_bos.expand(n_images, -1, -1),
            embedded_images,
            prompt_embeds.expand(n_images, -1, -1),
        ], dim=1).to(dtype=torch.bfloat16)

        # Token ids laid out like inputs_embeds; image positions hold id 0.
        input_ids = torch.cat([
            bos_ids.expand(n_images, -1),
            torch.zeros((n_images, n_image_tokens), dtype=torch.long),
            prompt.expand(n_images, -1),
        ], dim=1).to(rank)

        attention_mask = torch.ones_like(input_ids)

        generate_ids = text_model.generate(
            input_ids=input_ids,
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            max_new_tokens=300,
            do_sample=True,
            top_k=10,
            temperature=0.5,
        )

        # Keep only the tokens that follow the supplied prefix.
        generate_ids = generate_ids[:, input_ids.shape[1]:]

        for ids in generate_ids:
            caption = tokenizer.decode(ids[:-1] if ids[-1] == tokenizer.eos_token_id else ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            caption = caption.replace('<|end_of_text|>', '').replace('<|finetune_right_pad_id|>', '').strip()
            all_captions.append(caption)

        # Identity check: tqdm's truth value depends on its total and raises
        # TypeError for a bar created without one.
        if pbar is not None and rank == 0:
            pbar.update(len(batch))

    return all_captions

def process_directory(rank, world_size, input_dir: Path, output_dir: Path, batch_size: int, models: tuple):
    """Caption this rank's share of the images in ``input_dir``.

    Each caption is written to ``output_dir/<image stem>.txt``; images whose
    caption file already exists are skipped.

    Args:
        rank: Index of this process; selects its shard and its GPU.
        world_size: Number of processes sharing the directory.
        input_dir: Directory scanned (non-recursively) for image files.
        output_dir: Directory for caption files; created if missing.
        batch_size: Number of images captioned per ``stream_chat`` call.
        models: The tuple returned by ``load_models``.

    Files that cannot be opened or decoded are reported and skipped.
    """
    output_dir.mkdir(parents=True, exist_ok=True)

    def caption_path(image_file: Path) -> Path:
        return output_dir / f"{image_file.stem}.txt"

    # Sorted so that every rank sees the images in the same order.
    image_files = sorted(
        f for f in input_dir.iterdir()
        if f.suffix.lower() in IMAGE_EXTENSIONS and f.is_file()
    )
    pending = {f for f in image_files if not caption_path(f).exists()}

    if not pending:
        if rank == 0:
            print("No new images to process.")
        return

    # Shard by position in the full listing, not in the pending list: ranks
    # start at different times, and a rank that lists the directory after
    # another has begun writing captions would otherwise compute a different
    # partition, duplicating or skipping images. The stride also spreads any
    # remainder across ranks instead of giving it all to the last one.
    gpu_images = [f for f in image_files[rank::world_size] if f in pending]

    # Rank 0's bar tracks rank 0's own shard, the only work it ever reports.
    pbar = tqdm(total=len(gpu_images), desc="Processing images", unit="image") if rank == 0 else None

    try:
        for i in range(0, len(gpu_images), batch_size):
            batch_files = []
            batch_images = []
            for image_file in gpu_images[i:i+batch_size]:
                try:
                    # The context manager releases the file handle once the
                    # RGB copy has been made.
                    with Image.open(image_file) as raw:
                        rgb = raw.convert('RGB')
                except OSError as e:
                    print(f"Skipping unreadable image {image_file}: {e}")
                    if pbar is not None:
                        pbar.update(1)
                    continue
                batch_files.append(image_file)
                batch_images.append(rgb)

            if not batch_images:
                continue

            try:
                captions = stream_chat(batch_images, batch_size, pbar, models, rank)

                for file, caption in zip(batch_files, captions):
                    with open(caption_path(file), 'w', encoding='utf-8') as f:
                        f.write(caption)
            finally:
                for img in batch_images:
                    img.close()
    finally:
        if pbar is not None:
            pbar.close()

def _positive_int(value: str) -> int:
    """argparse ``type`` callable: parse ``value`` as an int and require >= 1."""
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {value!r}") from None
    if number < 1:
        raise argparse.ArgumentTypeError(f"must be a positive integer, got {number}")
    return number

def parse_arguments():
    """Parse the command line.

    Returns:
        Namespace with ``input`` (list of one or more paths), ``output``
        (directory or ``None``) and ``bs`` (batch size, default 4).

    A batch size below 1 is rejected here, because it would otherwise only
    fail later as a zero or negative ``range`` step.
    """
    parser = argparse.ArgumentParser(description="Process images and generate captions.")
    parser.add_argument("input", nargs='+', help="Input image file or directory (or multiple directories)")
    parser.add_argument("--output", help="Output directory (optional)")
    parser.add_argument("--bs", type=_positive_int, default=4, help="Batch size (default: 4)")
    return parser.parse_args()

def run(rank, world_size, args):
    """Worker entry point: join the process group, load models, caption inputs.

    Args:
        rank: Index of this process; also its CUDA device index.
        world_size: Total number of worker processes.
        args: Parsed command line with ``input``, ``output`` and ``bs``.

    Each input path is handled in turn: a single image file is captioned by
    rank 0 only and written next to the image; a directory is shared across
    all ranks via ``process_directory``; anything else is reported and skipped.
    The process group is torn down even if processing raises.
    """
    setup(rank, world_size)

    try:
        input_paths = [Path(input_path) for input_path in args.input]
        batch_size = args.bs
        models = load_models(rank)

        for input_path in input_paths:
            if input_path.is_file() and input_path.suffix.lower() in IMAGE_EXTENSIONS:
                if rank == 0:
                    output_path = input_path.with_suffix('.txt')
                    print(f"Processing single image 🎞️: {input_path.name}")
                    # Close the source file as soon as the RGB copy exists.
                    with Image.open(input_path) as raw:
                        image = raw.convert('RGB')
                    try:
                        with tqdm(total=1, desc="Processing image", unit="image") as pbar:
                            captions = stream_chat([image], 1, pbar, models, rank)
                    finally:
                        image.close()
                    # stream_chat returns no caption when it skipped the batch;
                    # only create the output file once there is text to write.
                    if captions:
                        with open(output_path, 'w', encoding='utf-8') as f:
                            f.write(captions[0])
                        print(f"Output saved to {output_path}")
                    else:
                        print(f"No caption generated for {input_path}")
            elif input_path.is_dir():
                output_path = Path(args.output) if args.output else input_path
                if rank == 0:
                    print(f"Processing directory 📁: {input_path}")
                    print(f"Output directory 📦: {output_path}")
                    print(f"Batch size 🗄️: {batch_size}")
                process_directory(rank, world_size, input_path, output_path, batch_size, models)
            else:
                if rank == 0:
                    print(f"Invalid input: {input_path}")
                    print("Skipping...")
    finally:
        cleanup()

def main():
    """Parse arguments and start one captioning worker per visible CUDA device.

    With several GPUs the workers are spawned as separate processes; with a
    single GPU the worker runs in the current process.

    Raises:
        SystemExit: If no CUDA device is available.
    """
    # Parse first so that `--help` works on machines without a GPU.
    args = parse_arguments()
    world_size = torch.cuda.device_count()
    if world_size < 1:
        # Fail early with a clear message instead of an error from NCCL set-up.
        raise SystemExit("No CUDA devices found: captioning requires at least one GPU.")
    if world_size > 1:
        mp.spawn(run, args=(world_size, args), nprocs=world_size, join=True)
    else:
        run(0, 1, args)

if __name__ == "__main__":
    main()