OPEN_VOCABULARY_DETECTION returns polygons which cannot be displayed

#16
by MoritzLaurer HF staff - opened

When I run OPEN_VOCABULARY_DETECTION on some images, it returns polygons instead of bboxes. The resulting polygons do not seem to be displayable on the image with the code from the example notebook.

# Task token passed to florence2_inference; it is used both as the prompt
# prefix and as the `task` argument for post-processing.
task_prompt = '<OPEN_VOCABULARY_DETECTION>'  

def florence2_inference(task_prompt, text_input=None):
    """Generate and post-process a prediction for one task prompt.

    Relies on the names ``processor``, ``model`` and ``image`` being defined
    outside this function (they are not part of this snippet).

    Parameters:
    - task_prompt: Task token string; also passed as ``task`` to the
                   post-processing step.
    - text_input: Optional text appended directly to ``task_prompt``.

    Returns whatever ``processor.post_process_generation`` returns for the
    first decoded sequence.
    """
    # The text input, when given, is concatenated without any separator.
    prompt = task_prompt if text_input is None else task_prompt + text_input

    encoded = processor(text=prompt, images=image, return_tensors="pt")

    generation_kwargs = dict(
        input_ids=encoded["input_ids"],
        pixel_values=encoded["pixel_values"],
        max_new_tokens=1024,
        early_stopping=False,
        do_sample=False,
        num_beams=3,
    )
    output_ids = model.generate(**generation_kwargs)

    # Special tokens are kept in the decoded text that is handed to the
    # post-processor.
    decoded = processor.batch_decode(output_ids, skip_special_tokens=False)

    return processor.post_process_generation(
        decoded[0],
        task=task_prompt,
        image_size=(image.width, image.height),
    )

# Run the detection task with a long free-form description as the text input
# and print the parsed result.
results = florence2_inference(task_prompt, text_input="The face of the character in the middle of the illustration. The expression is neutral, with distinct hair and facial features. The head is tilted slightly to the right.")
print(results)

{'<OPEN_VOCABULARY_DETECTION>': {'bboxes': [], 'bboxes_labels': [], 'polygons': [[[0.49000000953674316, 0.367000013589859, 979.510009765625, 0.367000013589859, 979.510009765625, 1.1010000705718994, 0.49000000953674316, 0.367000013589859]]], 'polygons_labels': ['The star located at the top left corner of the illustration. Small star with a bright yellow color and five points, slightly tilted to the left.']}}

Trying to draw the resulting polygon on an image results in horizontal lines.

from PIL import Image, ImageDraw, ImageFont 
import random
import numpy as np
# Color names that draw_polygons picks from at random for outlines and fills.
colormap = ['blue','orange','green','purple','brown','pink','gray','olive','cyan','red',
            'lime','indigo','violet','aqua','magenta','coral','gold','tan','skyblue']

def draw_polygons(image, prediction, fill_mask=False, scale=1):
    """
    Draws polygons and their labels on an image.

    The image is drawn on in place and the same object is returned.

    Parameters:
    - image: Image object to draw on (passed to ImageDraw.Draw).
    - prediction: Dictionary containing 'polygons' and 'polygons_labels' keys.
                  'polygons' is a list with one entry per label; each entry is
                  a list of polygons, and each polygon is a flat sequence of
                  coordinates [x0, y0, x1, y1, ...].
                  'polygons_labels' is a list of labels, one per entry of
                  'polygons'.
    - fill_mask: Boolean indicating whether to fill the polygons with color.
    - scale: Factor applied to every coordinate before drawing
             (use 1 for no scaling).

    Returns:
    - The image that was passed in, with the annotations drawn on it.
    """
    draw = ImageDraw.Draw(image)

    # Iterate over polygon groups and their labels
    for polygons, label in zip(prediction['polygons'], prediction['polygons_labels']):
        color = random.choice(colormap)
        fill_color = random.choice(colormap) if fill_mask else None

        for _polygon in polygons:
            coords = np.array(_polygon).reshape(-1)
            # An odd number of values cannot be split into (x, y) pairs;
            # skip it instead of letting reshape(-1, 2) raise ValueError.
            if coords.size % 2 != 0:
                print('Invalid polygon:', _polygon)
                continue

            points = coords.reshape(-1, 2)
            # A polygon needs at least three vertices.
            if len(points) < 3:
                print('Invalid polygon:', points)
                continue

            flat = (points * scale).reshape(-1).tolist()

            # Draw the polygon; fill_color is None when fill_mask is False,
            # which leaves the polygon unfilled.
            draw.polygon(flat, outline=color, fill=fill_color)

            # Draw the label text slightly offset from the first vertex
            draw.text((flat[0] + 8, flat[1] + 2), label, fill=color)

    return image


# NOTE(review): resize_image_from_url and image_url are not defined in this
# snippet — presumably they come from the example notebook; confirm.
image = resize_image_from_url(image_url)

# Draw annotations using the polygons returned for the detection task
annotated_image = draw_polygons(image, results["<OPEN_VOCABULARY_DETECTION>"])

# Open the annotated image in the default viewer
annotated_image.show()

Maybe my text inputs are too complex, which leads to out-of-distribution issues?

Sign up or log in to comment