adjust image processing for batch output

#63
config.json CHANGED
@@ -1,5 +1,6 @@
1
  {
2
  "_name_or_path": "openbmb/MiniCPM-Llama3-V-2_5",
 
3
  "architectures": [
4
  "MiniCPMV"
5
  ],
 
1
  {
2
  "_name_or_path": "openbmb/MiniCPM-Llama3-V-2_5",
3
+ "version": "2.5",
4
  "architectures": [
5
  "MiniCPMV"
6
  ],
image_processing_minicpmv.py CHANGED
@@ -396,7 +396,7 @@ class MiniCPMVImageProcessor(BaseImageProcessor):
396
  if tgt_sizes:
397
  tgt_sizes = np.vstack(tgt_sizes)
398
  return MiniCPMVBatchFeature(
399
- data={"pixel_values": new_images, "image_sizes": image_sizes, "tgt_sizes": tgt_sizes}, tensor_type=return_tensors
400
  )
401
 
402
  AutoImageProcessor.register("MiniCPMVImageProcessor", MiniCPMVImageProcessor)
 
396
  if tgt_sizes:
397
  tgt_sizes = np.vstack(tgt_sizes)
398
  return MiniCPMVBatchFeature(
399
+ data={"pixel_values": [new_images], "image_sizes": [image_sizes], "tgt_sizes": [tgt_sizes]}, tensor_type=return_tensors
400
  )
401
 
402
  AutoImageProcessor.register("MiniCPMVImageProcessor", MiniCPMVImageProcessor)
processing_minicpmv.py CHANGED
@@ -61,14 +61,10 @@ class MiniCPMVProcessor(ProcessorMixin):
61
  return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
62
  ) -> MiniCPMVBatchFeature:
63
  """
64
- Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
65
- and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
66
- the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
67
- LlavaNextImageProcessor's [`~LlavaNextImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
68
- of the above two methods for more information.
69
 
70
  Args:
71
- text (`str`, `List[str]`, `List[List[str]]`):
72
  The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
73
  (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
74
  `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
@@ -176,19 +172,19 @@ class MiniCPMVProcessor(ProcessorMixin):
176
  images, image_sizes, tgt_sizes = images["pixel_values"], images["image_sizes"], images["tgt_sizes"]
177
 
178
  image_tags = re.findall(pattern, texts)
179
- assert len(image_tags) == len(image_sizes)
180
  text_chunks = texts.split(pattern)
181
  final_texts = ""
182
  for i in range(len(image_tags)):
183
- final_texts = final_texts + text_chunks[i] + self.image_processor.get_slice_image_placeholder(image_sizes[i])
184
  final_texts += text_chunks[-1]
185
  input_ids, image_bounds = self._convert(final_texts, max_length)
186
  return MiniCPMVBatchFeature(data={
187
  "input_ids": input_ids,
188
- "pixel_values": [images],
189
- "image_sizes": [image_sizes],
190
  "image_bound": [image_bounds],
191
- "tgt_sizes": [tgt_sizes]
192
  })
193
 
194
  @property
@@ -244,4 +240,5 @@ class MiniCPMVProcessor(ProcessorMixin):
244
  else:
245
  tensor[i, : len(item[key][0]), :] = item[key][0].clone()
246
 
247
- return tensor
 
 
61
  return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
62
  ) -> MiniCPMVBatchFeature:
63
  """
64
+ Only a single input is supported for now; batched input support is coming soon.
 
 
 
 
65
 
66
  Args:
67
+ text (`str`):
68
  The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
69
  (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
70
  `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
 
172
  images, image_sizes, tgt_sizes = images["pixel_values"], images["image_sizes"], images["tgt_sizes"]
173
 
174
  image_tags = re.findall(pattern, texts)
175
+ assert len(image_tags) == len(image_sizes[0])
176
  text_chunks = texts.split(pattern)
177
  final_texts = ""
178
  for i in range(len(image_tags)):
179
+ final_texts = final_texts + text_chunks[i] + self.image_processor.get_slice_image_placeholder(image_sizes[0][i])
180
  final_texts += text_chunks[-1]
181
  input_ids, image_bounds = self._convert(final_texts, max_length)
182
  return MiniCPMVBatchFeature(data={
183
  "input_ids": input_ids,
184
+ "pixel_values": images,
185
+ "image_sizes": image_sizes,
186
  "image_bound": [image_bounds],
187
+ "tgt_sizes": tgt_sizes
188
  })
189
 
190
  @property
 
240
  else:
241
  tensor[i, : len(item[key][0]), :] = item[key][0].clone()
242
 
243
+ return tensor
244
+