vllm.model_executor.models.aya_vision

AyaVisionImagePixelInputs

Bases: TensorSchema

Dimensions
  • np: The total number of patches across all images and prompts in the batch
  • c: Number of channels
  • h: Height of each image patch
  • w: Width of each image patch
  • bn: Batch size * number of images
Source code in vllm/model_executor/models/aya_vision.py
class AyaVisionImagePixelInputs(TensorSchema):
    """
    Dimensions:
        - np: The total number of patches across all images and prompts in
              the batch
        - c: Number of channels
        - h: Height of each image patch
        - w: Width of each image patch
        - bn: Batch size * number of images
    """

    type: Literal["pixel_values"]

    pixel_values: Annotated[
        torch.Tensor,
        TensorShape("np", 3, "h", "w"),
    ]

    num_patches: Annotated[
        torch.Tensor,
        TensorShape("bn"),
    ]
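Concretely, the patches of every image in the batch are stacked along the first axis of pixel_values, and num_patches records how many of those rows belong to each image, so num_patches.sum() must equal the np dimension. A minimal sketch of that invariant with plain torch tensors (the two patch counts and the 364 x 364 patch size below are illustrative, not values read from the model config):

import torch

# Illustrative example: 2 images in the batch, tiled into 5 and 10
# patches respectively (made-up numbers).
patch_h, patch_w = 364, 364          # h, w: size of each image patch
num_patches = torch.tensor([5, 10])  # shape (bn,): patches per image
total = int(num_patches.sum())       # np: total patches in the batch

# pixel_values stacks every patch of every image along dim 0,
# matching TensorShape("np", 3, "h", "w") above.
pixel_values = torch.randn(total, 3, patch_h, patch_w)

assert pixel_values.shape[0] == num_patches.sum().item()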

AyaVisionProcessingInfo

Bases: BaseProcessingInfo

Source code in vllm/model_executor/models/aya_vision.py
class AyaVisionProcessingInfo(BaseProcessingInfo):
    def get_hf_config(self) -> AyaVisionConfig:
        return self.ctx.get_hf_config(AyaVisionConfig)

    def get_hf_processor(self, **kwargs: object) -> AyaVisionProcessor:
        return self.ctx.get_hf_processor(AyaVisionProcessor, **kwargs)

    def get_image_processor(self, **kwargs: object) -> GotOcr2ImageProcessor:
        return self.get_hf_processor(**kwargs).image_processor

    def get_supported_mm_limits(self) -> Mapping[str, int | None]:
        return {"image": None}

    def get_image_size_with_most_features(self) -> ImageSize:
        image_processor = self.get_image_processor()
        height = image_processor.size["height"]
        width = image_processor.size["width"]
        max_patches = image_processor.max_patches
        return ImageSize(height=height * max_patches, width=width * max_patches)

    def get_num_patches(
        self,
        *,
        image_width: int,
        image_height: int,
        size: dict,
        min_patches: int,
        max_patches: int,
    ) -> int:
        """
        Calculate the number of patches needed for a given image based on size
        constraints.  This method replicates and adjusts the logic from:
        transformers/models/got_ocr2/image_processing_got_ocr2
        """
        size = get_size_dict(size, default_to_square=False)
        num_columns, num_rows = get_optimal_tiled_canvas(
            (image_height, image_width),
            (size["height"], size["width"]),
            min_patches,
            max_patches,
        )
        num_blocks = num_columns * num_rows
        return num_blocks if num_blocks == 1 else num_blocks + 1

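For intuition, get_image_size_with_most_features simply scales the per-patch size by the patch budget, giving the largest canvas the processor would ever tile. A small sketch with hypothetical processor values (the 364 x 364 patch size and max_patches = 12 are assumptions for illustration, not values read from a checkpoint):

# Hypothetical stand-ins for image_processor.size and
# image_processor.max_patches.
size = {"height": 364, "width": 364}
max_patches = 12

# Mirrors the return value ImageSize(height=..., width=...).
height = size["height"] * max_patches  # 4368
width = size["width"] * max_patches    # 4368
print(height, width)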
get_num_patches

get_num_patches(
    *,
    image_width: int,
    image_height: int,
    size: dict,
    min_patches: int,
    max_patches: int,
) -> int

Calculate the number of patches needed for a given image based on size constraints. This method replicates and adjusts the logic from: transformers/models/got_ocr2/image_processing_got_ocr2

Source code in vllm/model_executor/models/aya_vision.py
def get_num_patches(
    self,
    *,
    image_width: int,
    image_height: int,
    size: dict,
    min_patches: int,
    max_patches: int,
) -> int:
    """
    Calculate the number of patches needed for a given image based on size
    constraints.  This method replicates and adjusts the logic from:
    transformers/models/got_ocr2/image_processing_got_ocr2
    """
    size = get_size_dict(size, default_to_square=False)
    num_columns, num_rows = get_optimal_tiled_canvas(
        (image_height, image_width),
        (size["height"], size["width"]),
        min_patches,
        max_patches,
    )
    num_blocks = num_columns * num_rows
    return num_blocks if num_blocks == 1 else num_blocks + 1
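The +1 in the return value accounts for the extra global view of the full image that the GOT-OCR2 processor appends whenever the image is split into more than one tile. The grid itself comes from get_optimal_tiled_canvas in transformers; the sketch below is a simplified, self-contained approximation of that rule for intuition only (approx_num_patches is a hypothetical helper, and the max_patches = 12 default is an assumption, not a config value):

def approx_num_patches(
    image_width: int,
    image_height: int,
    min_patches: int = 1,
    max_patches: int = 12,
) -> int:
    """Pick the (rows, cols) grid within the patch budget whose aspect
    ratio is closest to the image, then add one global thumbnail tile
    when the image is split (simplified vs. the real canvas search)."""
    best_grid, best_diff = (1, 1), float("inf")
    image_ratio = image_width / image_height
    for rows in range(1, max_patches + 1):
        for cols in range(1, max_patches + 1):
            if not (min_patches <= rows * cols <= max_patches):
                continue
            diff = abs(cols / rows - image_ratio)
            if diff < best_diff:
                best_grid, best_diff = (rows, cols), diff
    num_blocks = best_grid[0] * best_grid[1]
    return num_blocks if num_blocks == 1 else num_blocks + 1

# A wide 1456 x 364 image tiles into a 1 x 4 grid plus a thumbnail -> 5,
# while a square 364 x 364 image stays a single patch -> 1.
print(approx_num_patches(1456, 364))  # 5
print(approx_num_patches(364, 364))   # 1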