Batch inference bug: all outputs in the same batch have the same prediction, even for different images

#8
by weihf - opened

The model is great, but batch inference has a problem: even when the batch contains different images, every output is the same. It seems to be running inference on only a single image.
This is my test code:

from transformers import AutoProcessor, Kosmos2_5ForConditionalGeneration
from PIL import Image
import torch
from tqdm import tqdm
import os

# Precision requested for the model weights; passed as `torch_dtype` when the
# model is loaded in initialize_model().
DTYPE = torch.bfloat16

def initialize_model(model_name: str):
    """Load the Kosmos-2.5 model and its matching processor.

    Args:
        model_name: Identifier handed to ``from_pretrained`` for both the
            model and the processor.

    Returns:
        A ``(model, processor)`` tuple.
    """
    print(f"Initializing {model_name} model")

    # The model is loaded first, then the processor, each from the same name.
    load_options = {"device_map": "auto", "torch_dtype": DTYPE}
    loaded_model = Kosmos2_5ForConditionalGeneration.from_pretrained(
        model_name,
        **load_options,
    )
    loaded_processor = AutoProcessor.from_pretrained(model_name)

    return loaded_model, loaded_processor

def test_kosmos_batch(
        model, 
        processor, 
        image_dir,
        prompt="<md>",
        batch_size=1,
        device = "cuda",
        dtype = torch.bfloat16,
        max_new_tokens=1024,
    ):
    """Run batched generation over every file in ``image_dir`` and print the results.

    Args:
        model: Model exposing ``generate(**inputs, max_new_tokens=...)``.
        processor: Processor used both to build the model inputs from
            text + images and to decode the generated ids.
        image_dir: Directory whose regular files are all opened as images.
        prompt: Text prompt repeated once per image in a batch.
        batch_size: Number of images sent through the model per step.
        device: Device every non-None input tensor is moved to.
        dtype: Dtype that ``flattened_patches`` is cast to.
        max_new_tokens: Generation length limit forwarded to ``generate``.

    Returns:
        A list with one decoded string per image, in sorted path order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # os.listdir() returns entries in arbitrary order; sort so that
    # "Output i" can be mapped back to a specific file across runs.
    # Subdirectories are skipped because they cannot be opened as images.
    image_paths = sorted(
        path
        for path in (os.path.join(image_dir, name) for name in os.listdir(image_dir))
        if os.path.isfile(path)
    )

    outputs = []

    # Stepping by batch_size yields exactly ceil(len / batch_size) iterations.
    for start in tqdm(range(0, len(image_paths), batch_size)):
        batch_paths = image_paths[start : start + batch_size]

        images = []
        for path in batch_paths:
            # copy() decodes the pixels into an independent image, so the
            # underlying file handle can be closed right away.
            with Image.open(path) as img:
                images.append(img.copy())

        inputs = processor(
            text=[prompt] * len(images),
            images=images,
            return_tensors="pt",
            padding=True,
        )

        # Move tensors to the target device and drop "width"/"height" before
        # calling generate(). Each key is dropped independently, so a missing
        # "width" no longer leaves "height" behind.
        # NOTE(review): presumably generate() does not accept these two keys — confirm.
        inputs = {
            k: v.to(device) if v is not None else None
            for k, v in inputs.items()
            if k not in ("width", "height")
        }
        inputs["flattened_patches"] = inputs["flattened_patches"].to(dtype)
        print("Processing inputs completed")

        generated_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
        )
        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)
        outputs.extend(generated_text)

    for i, output in enumerate(outputs):
        print("=========================================")
        print(f"Output {i}: {output}")

    return outputs

if __name__ == "__main__":
    # Script entry point: load the model once, then run a small batched test
    # (3 images per batch, at most 50 new tokens each) over ./images.
    checkpoint = "microsoft/kosmos-2.5"
    image_dir = "./images"

    model, processor = initialize_model(checkpoint)
    print("Model initialized successfully!")

    test_kosmos_batch(
        model=model,
        processor=processor,
        image_dir=image_dir,
        batch_size=3,
        max_new_tokens=50,
    )

All three outputs describe the same image, even though the batch contained different images:

=========================================
Output 0: <md>**Section 10.** **Effectivity.** This Circular shall take effect fifteen (15) calendar days following its publication either in the Official Gazette or in a newspaper of general circulation.

# Classification: GENERAL
=========================================
Output 1: <md>10\. *Effectivity*. This Circular shall take effect fifteen (15) calendar days following its publication either in the Official Gazette or in a newspaper of general circulation.

# Classification: GENERAL
=========================================
Output 2: <md># Section 10. Effectivity. This Circular shall take effect fifteen (15) calendar days following its publication in the Official Gazette or in a newspaper of general circulation.

# 1. The effectivity of this Circular shall be determined by the following

Sign up or log in to comment