khang119966 committed on
Commit 92e7e3a
1 Parent(s): f302e3b

Update app.py

Files changed (1)
  1. app.py +133 -103
app.py CHANGED
@@ -1,7 +1,16 @@
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, StoppingCriteria
-from modeling_llava_qwen2 import LlavaQwen2ForCausalLM
+import gradio as gr
+import spaces
+import torch
+import numpy as np
+import torch
+import torchvision.transforms as T
+from PIL import Image
+from torchvision.transforms.functional import InterpolationMode
+from transformers import AutoModel, AutoTokenizer
+
from threading import Thread
import re
import time
@@ -13,117 +22,138 @@ subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENT

torch.set_default_device('cuda')

-tokenizer = AutoTokenizer.from_pretrained(
-    'qnguyen3/nanoLLaVA-1.5',
-    trust_remote_code=True)
-
-model = LlavaQwen2ForCausalLM.from_pretrained(
-    'qnguyen3/nanoLLaVA-1.5',
-    torch_dtype=torch.float16,
-    attn_implementation="flash_attention_2",
-    trust_remote_code=True)
-
-model.to('cuda')
-
-class KeywordsStoppingCriteria(StoppingCriteria):
-    def __init__(self, keywords, tokenizer, input_ids):
-        self.keywords = keywords
-        self.keyword_ids = []
-        self.max_keyword_len = 0
-        for keyword in keywords:
-            cur_keyword_ids = tokenizer(keyword).input_ids
-            if len(cur_keyword_ids) > 1 and cur_keyword_ids[0] == tokenizer.bos_token_id:
-                cur_keyword_ids = cur_keyword_ids[1:]
-            if len(cur_keyword_ids) > self.max_keyword_len:
-                self.max_keyword_len = len(cur_keyword_ids)
-            self.keyword_ids.append(torch.tensor(cur_keyword_ids))
-        self.tokenizer = tokenizer
-        self.start_len = input_ids.shape[1]
-
-    def call_for_batch(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
-        offset = min(output_ids.shape[1] - self.start_len, self.max_keyword_len)
-        self.keyword_ids = [keyword_id.to(output_ids.device) for keyword_id in self.keyword_ids]
-        for keyword_id in self.keyword_ids:
-            truncated_output_ids = output_ids[0, -keyword_id.shape[0]:]
-            if torch.equal(truncated_output_ids, keyword_id):
-                return True
-        outputs = self.tokenizer.batch_decode(output_ids[:, -offset:], skip_special_tokens=True)[0]
-        for keyword in self.keywords:
-            if keyword in outputs:
-                return True
-        return False
-
-    def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
-        outputs = []
-        for i in range(output_ids.shape[0]):
-            outputs.append(self.call_for_batch(output_ids[i].unsqueeze(0), scores))
-        return all(outputs)
+
+IMAGENET_MEAN = (0.485, 0.456, 0.406)
+IMAGENET_STD = (0.229, 0.224, 0.225)
+
+def build_transform(input_size):
+    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
+    transform = T.Compose([
+        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
+        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
+        T.ToTensor(),
+        T.Normalize(mean=MEAN, std=STD)
+    ])
+    return transform
+
+def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
+    best_ratio_diff = float('inf')
+    best_ratio = (1, 1)
+    area = width * height
+    for ratio in target_ratios:
+        target_aspect_ratio = ratio[0] / ratio[1]
+        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
+        if ratio_diff < best_ratio_diff:
+            best_ratio_diff = ratio_diff
+            best_ratio = ratio
+        elif ratio_diff == best_ratio_diff:
+            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
+                best_ratio = ratio
+    return best_ratio
+
+def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
+    orig_width, orig_height = image.size
+    aspect_ratio = orig_width / orig_height
+
+    # calculate the existing image aspect ratio
+    target_ratios = set(
+        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
+        i * j <= max_num and i * j >= min_num)
+    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+
+    # find the closest aspect ratio to the target
+    target_aspect_ratio = find_closest_aspect_ratio(
+        aspect_ratio, target_ratios, orig_width, orig_height, image_size)
+
+    # calculate the target width and height
+    target_width = image_size * target_aspect_ratio[0]
+    target_height = image_size * target_aspect_ratio[1]
+    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
+
+    # resize the image
+    resized_img = image.resize((target_width, target_height))
+    processed_images = []
+    for i in range(blocks):
+        box = (
+            (i % (target_width // image_size)) * image_size,
+            (i // (target_width // image_size)) * image_size,
+            ((i % (target_width // image_size)) + 1) * image_size,
+            ((i // (target_width // image_size)) + 1) * image_size
+        )
+        # split the image
+        split_img = resized_img.crop(box)
+        processed_images.append(split_img)
+    assert len(processed_images) == blocks
+    if use_thumbnail and len(processed_images) != 1:
+        thumbnail_img = image.resize((image_size, image_size))
+        processed_images.append(thumbnail_img)
+    return processed_images
+
+def load_image(image_file, input_size=448, max_num=12):
+    image = Image.open(image_file).convert('RGB')
+    transform = build_transform(input_size=input_size)
+    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
+    pixel_values = [transform(image) for image in images]
+    pixel_values = torch.stack(pixel_values)
+    return pixel_values
+
+model = AutoModel.from_pretrained(
+    "5CD-AI/Viet-InternVL2-1B",
+    torch_dtype=torch.bfloat16,
+    low_cpu_mem_usage=True,
+    trust_remote_code=True,
+).eval().cuda()
+tokenizer = AutoTokenizer.from_pretrained("5CD-AI/Viet-InternVL2-1B", trust_remote_code=True, use_fast=False)


@spaces.GPU
-def bot_streaming(message, history):
-    messages = []
-    if message["files"]:
-        image = message["files"][-1]["path"]
+def chat(message, history):
+    print(history)
+    print(message)
+    if len(history) == 0 or len(message["files"]) != 0:
+        test_image = message["files"][0]["path"]
    else:
-        for i, hist in enumerate(history):
-            if type(hist[0])==tuple:
-                image = hist[0][0]
-                image_turn = i
-
-    if len(history) > 0 and image is not None:
-        messages.append({"role": "user", "content": f'<image>\n{history[1][0]}'})
-        messages.append({"role": "assistant", "content": history[1][1] })
-        for human, assistant in history[2:]:
-            messages.append({"role": "user", "content": human })
-            messages.append({"role": "assistant", "content": assistant })
-        messages.append({"role": "user", "content": message['text']})
-    elif len(history) > 0 and image is None:
-        for human, assistant in history:
-            messages.append({"role": "user", "content": human })
-            messages.append({"role": "assistant", "content": assistant })
-        messages.append({"role": "user", "content": message['text']})
-    elif len(history) == 0 and image is not None:
-        messages.append({"role": "user", "content": f"<image>\n{message['text']}"})
-    elif len(history) == 0 and image is None:
-        messages.append({"role": "user", "content": message['text'] })
-
-    # if image is None:
-    #     gr.Error("You need to upload an image for LLaVA to work.")
-    image = Image.open(image).convert("RGB")
-    text = tokenizer.apply_chat_template(
-        messages,
-        tokenize=False,
-        add_generation_prompt=True)
-    text_chunks = [tokenizer(chunk).input_ids for chunk in text.split('<image>')]
-    input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0)
-    stop_str = '<|im_end|>'
-    keywords = [stop_str]
-    stopping_criteria = KeywordsStoppingCriteria(keywords, tokenizer, input_ids)
-    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
+        test_image = history[0][0][0]
+
+    pixel_values = load_image(test_image, max_num=12).to(torch.bfloat16).cuda()
+    generation_config = dict(max_new_tokens= 1024, do_sample=True, num_beams = 3, repetition_penalty=2.5)
+

-    image_tensor = model.process_images([image], model.config).to(dtype=model.dtype)
-    generation_kwargs = dict(input_ids=input_ids.to('cuda'),
-                             images=image_tensor.to('cuda'),
-                             streamer=streamer, max_new_tokens=512,
-                             stopping_criteria=[stopping_criteria], temperature=0.01)
-    generated_text = ""
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
-    thread.start()
-    text_prompt =f"<|im_start|>user\n{message['text']}<|im_end|>"

+    if len(history) == 0:
+        question = '<image>\n'+message["text"]
+        response, conv_history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
+    else:
+        conv_history = []
+        for chat_pair in history:
+            if chat_pair[1] is not None:
+                if len(conv_history) == 0 and len(message["files"]) == 0:
+                    chat_pair[0] = '<image>\n' + chat_pair[0]
+                conv_history.append(tuple(chat_pair))
+        print(conv_history)
+        if len(message["files"]) != 0:
+            question = '<image>\n'+message["text"]
+        else:
+            question = message["text"]
+        response, conv_history = model.chat(tokenizer, pixel_values, question, generation_config, history=conv_history, return_history=True)
+
+    print(f'User: {question}\nAssistant: {response}')
+
    buffer = ""
-    for new_text in streamer:
-
+    for new_text in response:
        buffer += new_text
-
        generated_text_without_prompt = buffer[:]
-        time.sleep(0.04)
+        time.sleep(0.06)
        yield generated_text_without_prompt
+    # return response

-
-demo = gr.ChatInterface(fn=bot_streaming, title="🚀nanoLLaVA-1.5", examples=[{"text": "Who is this guy?", "files":["./demo_1.jpg"]},
-                        {"text": "What does the text say?", "files":["./demo_2.jpeg"]}],
-                        description="Try [nanoLLaVA](https://huggingface.co/qnguyen3/nanoLLaVA-1.5) in this demo. Built on top of [Quyen-SE-v0.1](https://huggingface.co/vilm/Quyen-SE-v0.1) (Qwen1.5-0.5B) and [Google SigLIP-400M](https://huggingface.co/google/siglip-so400m-patch14-384). Upload an image and start chatting about it, or simply try one of the examples below. If you don't upload an image, you will receive an error.",
-                        stop_btn="Stop Generation", multimodal=True)
+demo = gr.ChatInterface(
+    fn=chat,
+    chatbot=gr.Chatbot(height=500),
+    description="""Try [Vintern-1B](https://huggingface.co/5CD-AI/Viet-InternVL2-1B) in this demo. Vintern 1B is a multimodal large language model series, featuring models of various sizes. For each size, we release instruction-tuned models optimized for multimodal tasks. Vintern-1B consists of InternViT-300M-448px, an MLP projector, and Qwen2-0.5B-Instruct.""",
+    examples=[{"text": "Tổng giá tiền trong hóa đơn là bao nhiêu ?", "files":["./demo_1.jpg"]},
+              {"text": "Mô tả hình ảnh một cách chi tiết.", "files":["./demo_2.jpg"]}],
+    title="❄️ Vintern-1B ❄️",
+    multimodal=True,
+)
demo.queue().launch()
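
For quick local verification, below is a minimal standalone sketch of the inference path this commit introduces, with the Gradio and streaming plumbing stripped out. The checkpoint id, dtype, '<image>' prompt format, generation settings, and the model.chat call are taken from the diff above; the CUDA GPU, the bundled ./demo_1.jpg example image, and the single 448x448 tile (used here instead of the dynamic_preprocess tiling defined in app.py) are assumptions of this sketch, not part of the commit.

import torch
import torchvision.transforms as T
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

MODEL_ID = "5CD-AI/Viet-InternVL2-1B"

# Same checkpoint, dtype, and tokenizer options as the updated app.py.
model = AutoModel.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True,
).eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True, use_fast=False)

# Simplified preprocessing: one 448x448 tile with ImageNet normalization.
# The Space itself tiles the image with dynamic_preprocess() before stacking.
transform = T.Compose([
    T.Lambda(lambda img: img.convert("RGB")),
    T.Resize((448, 448), interpolation=InterpolationMode.BICUBIC),
    T.ToTensor(),
    T.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
])
pixel_values = transform(Image.open("./demo_1.jpg")).unsqueeze(0).to(torch.bfloat16).cuda()

# '<image>\n' marks where the image is injected, exactly as in the chat() handler.
question = "<image>\nMô tả hình ảnh một cách chi tiết."
generation_config = dict(max_new_tokens=1024, do_sample=True, num_beams=3, repetition_penalty=2.5)

# Single-turn call; pass the returned history back in for follow-up turns.
response, history = model.chat(
    tokenizer, pixel_values, question, generation_config,
    history=None, return_history=True,
)
print(response)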