Source code for otx.data.transform_libs.utils

# Copyright (C) 2024 Intel Corporation
# SPDX-License-Identifier: Apache-2.0

# Copyright (c) OpenMMLab. All rights reserved.

"""Utils for data transform functions."""

from __future__ import annotations

import copy
import functools
import inspect
import itertools
import weakref
from typing import TYPE_CHECKING, Sequence

import cv2
import numpy as np
import torch
from shapely import geometry
from torch import BoolTensor, Tensor

if TYPE_CHECKING:
    from datumaro import Polygon


CV2_INTERP_CODES = {
    "nearest": cv2.INTER_NEAREST,
    "bilinear": cv2.INTER_LINEAR,
    "bicubic": cv2.INTER_CUBIC,
    "area": cv2.INTER_AREA,
    "lanczos": cv2.INTER_LANCZOS4,
}


class cache_randomness:  # noqa: N801
    """Decorator that marks the method with random return value(s) in a transform class.

    Reference : https://github.com/open-mmlab/mmcv/blob/v2.1.0/mmcv/transforms/utils.py#L15-L87

    This decorator is usually used together with the context-manager
    :func:`cache_random_params`. In this context, a decorated method will
    cache its return value(s) at the first time of being invoked, and always
    return the cached values when being invoked again.

    .. note::
        Only an instance method can be decorated with ``cache_randomness``.
    """

    def __init__(self, func):  # noqa: ANN001
        # Check `func` is to be bound as an instance method
        if not inspect.isfunction(func):
            msg = "Unsupported callable to decorate with @cache_randomness."
            raise TypeError(msg)

        func_args = inspect.getfullargspec(func).args
        if len(func_args) == 0 or func_args[0] != "self":
            msg = "@cache_randomness should only be used to decorate instance methods (the first argument is ``self``)."
            raise TypeError(msg)

        functools.update_wrapper(self, func)
        self.func = func
        self.instance_ref = None

    def __set_name__(self, owner, name):  # noqa: ANN001
        # Maintain a record of decorated methods in the class
        if not hasattr(owner, "_methods_with_randomness"):
            owner._methods_with_randomness = []  # noqa: SLF001

        # Here `name` equals `self.__name__`, i.e., the name of the
        # decorated function, due to the invocation of `update_wrapper` in
        # `self.__init__()`
        owner._methods_with_randomness.append(name)  # noqa: SLF001
    def __call__(self, *args, **kwargs):  # noqa: D102
        # Get the transform instance whose method is decorated
        # by cache_randomness
        instance = self.instance_ref()
        name = self.__name__

        # Check the flag ``self._cache_enabled``, which should be
        # set by context managers like ``cache_random_params``
        cache_enabled = getattr(instance, "_cache_enabled", False)

        if cache_enabled:
            # Initialize the cache of the transform instance. The flag
            # ``_cache_enabled`` is set by context managers like
            # ``cache_random_params``.
            if not hasattr(instance, "_cache"):
                instance._cache = {}  # noqa: SLF001

            if name not in instance._cache:  # noqa: SLF001
                instance._cache[name] = self.func(instance, *args, **kwargs)  # noqa: SLF001
            # Return the cached value
            return instance._cache[name]  # noqa: SLF001

        # Clear cache
        if hasattr(instance, "_cache"):
            del instance._cache  # noqa: SLF001
        # Return function output
        return self.func(instance, *args, **kwargs)
    def __get__(self, obj, cls):  # noqa: ANN001
        self.instance_ref = weakref.ref(obj)
        # Return a copy to avoid multiple transform instances sharing
        # one `cache_randomness` instance, which may cause data races
        # in multithreading cases.
        return copy.copy(self)
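
# Illustrative usage sketch (not part of the module): ``RandomShift`` below is a
# hypothetical transform; the decorator only requires the decorated callable to
# be an instance method. With ``_cache_enabled`` set (normally done by a context
# manager such as ``cache_random_params``), repeated calls reuse the first
# randomly drawn value.
#
#   class RandomShift:
#       @cache_randomness
#       def _random_offset(self):
#           return float(np.random.uniform(-1, 1))
#
#       def __call__(self, img: np.ndarray) -> np.ndarray:
#           return img + self._random_offset()
#
#   t = RandomShift()
#   t._cache_enabled = True
#   t(np.zeros(2)) == t(np.zeros(2))  # same offset both times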
def get_image_shape(img: np.ndarray | Tensor | list) -> tuple[int, int]:
    """Get image(s) shape with (height, width)."""
    if not isinstance(img, (np.ndarray, Tensor, list)):
        msg = f"{type(img)} is not supported."
        raise TypeError(msg)

    if isinstance(img, np.ndarray):
        return img.shape[:2]
    if isinstance(img, Tensor):
        return img.shape[-2:]
    return get_image_shape(img[0])  # for list
def to_np_image(img: np.ndarray | Tensor | list) -> np.ndarray | list[np.ndarray]:
    """Convert torch.Tensor 3D image to numpy 3D image.

    TODO (sungchul): move it into base data entity?
    """
    if isinstance(img, np.ndarray):
        return img
    if isinstance(img, list):
        return [to_np_image(im) for im in img]
    return np.ascontiguousarray(img.numpy().transpose(1, 2, 0))
def rescale_bboxes(boxes: Tensor, scale_factor: tuple[float, float]) -> Tensor:
    """Rescale boxes w.r.t. scale_factor in-place.

    Note:
        Both ``rescale_`` and ``resize_`` will enlarge or shrink boxes w.r.t. ``scale_factor``.
        The difference is that ``resize_`` only changes the width and the height of boxes,
        but ``rescale_`` also rescales the box centers simultaneously.

    Args:
        boxes (Tensor): bounding boxes to be rescaled.
        scale_factor (tuple[float, float]): factors for scaling boxes with (height, width).
            It will be used after flipping. The length should be 2.

    Returns:
        (Tensor): rescaled bounding boxes.
    """
    assert len(scale_factor) == 2  # noqa: S101
    scale_factor = boxes.new_tensor(scale_factor[::-1]).repeat(2)
    return boxes * scale_factor
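
# Worked example (illustrative): ``scale_factor`` is (height, width), so the
# x coordinates are multiplied by the width factor and the y coordinates by
# the height factor, moving the box center as well.
#
#   boxes = torch.tensor([[10.0, 20.0, 30.0, 40.0]])
#   rescale_bboxes(boxes, (2.0, 0.5))  # -> tensor([[ 5., 40., 15., 80.]])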
def rescale_masks(
    masks: np.ndarray,
    scale_factor: float | tuple[float, float],  # (H, W)
    interpolation: str = "nearest",
) -> np.ndarray:
    """Rescale masks as large as possible while keeping the aspect ratio.

    Args:
        masks (np.ndarray): Masks to be rescaled.
        scale_factor (float | tuple[float, float]): Scale factor to be applied to masks
            with (height, width).
        interpolation (str): Interpolation mode. Defaults to `nearest`.

    Returns:
        (np.ndarray): The rescaled masks.
    """
    h, w = masks.shape[1:]
    new_size = rescale_size((h, w), scale_factor)  # (H, W)

    # flipping `new_size` is required because cv2.resize uses (W, H)
    return np.stack(
        [cv2.resize(mask, new_size[::-1], interpolation=CV2_INTERP_CODES[interpolation]) for mask in masks],
    )
def rescale_polygons(polygons: list[Polygon], scale_factor: float | tuple[float, float]) -> list[Polygon]:
    """Rescale polygons as large as possible while keeping the aspect ratio.

    Args:
        polygons (list[Polygon]): Polygons to be rescaled.
        scale_factor (float | tuple[float, float]): Scale factor to be applied to polygons
            with (height, width) or a single float value.

    Returns:
        (list[Polygon]): The rescaled polygons.
    """
    if isinstance(scale_factor, float):
        w_scale = h_scale = scale_factor
    else:
        h_scale, w_scale = scale_factor

    for polygon in polygons:
        p = np.asarray(polygon.points, dtype=np.float32)
        p[0::2] *= w_scale
        p[1::2] *= h_scale
        polygon.points = p.tolist()
    return polygons
def rescale_keypoints(keypoints: Tensor, scale_factor: float | tuple[float, float]) -> Tensor:
    """Rescale keypoints as large as possible while keeping the aspect ratio.

    Args:
        keypoints (Tensor): Keypoints to be rescaled.
        scale_factor (float | tuple[float, float]): Scale factor to be applied to keypoints
            with (height, width) or a single float value.

    Returns:
        (Tensor): The rescaled keypoints.
    """
    if isinstance(scale_factor, float):
        w_scale = h_scale = scale_factor
    else:
        h_scale, w_scale = scale_factor

    keypoints[:, 0] *= w_scale
    keypoints[:, 1] *= h_scale
    return keypoints
def translate_bboxes(boxes: Tensor, distances: Sequence[float]) -> Tensor:
    """Translate boxes in-place.

    Args:
        boxes (Tensor): Bounding boxes to be translated.
        distances (Sequence[float]): Translate distances. The first is horizontal distance
            and the second is vertical distance.

    Returns:
        (Tensor): Translated bounding boxes.
    """
    assert len(distances) == 2  # noqa: S101
    return boxes + boxes.new_tensor(distances).repeat(2)
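
# Worked example (illustrative): the first distance shifts x coordinates, the
# second shifts y coordinates.
#
#   boxes = torch.tensor([[0.0, 0.0, 10.0, 10.0]])
#   translate_bboxes(boxes, (5.0, -2.0))  # -> tensor([[ 5., -2., 15.,  8.]])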
def translate_masks(
    masks: np.ndarray,
    out_shape: tuple[int, int],
    offset: int | float,
    direction: str = "horizontal",
    border_value: int | tuple[int] = 0,
    interpolation: str = "bilinear",
) -> np.ndarray:
    """Translate the masks.

    Args:
        masks (np.ndarray): Masks to be translated.
        out_shape (tuple[int]): Shape for output mask, format (h, w).
        offset (int | float): The offset for translation.
        direction (str): The translate direction, either "horizontal" or "vertical".
        border_value (int | tuple[int]): Border value. Default 0 for masks.
        interpolation (str): Interpolation method, accepted values are
            'nearest', 'bilinear', 'bicubic', 'area', 'lanczos'. Defaults to 'bilinear'.

    Returns:
        (np.ndarray): Translated masks.
    """
    dtype = masks.dtype
    if masks.shape[-2:] != out_shape:
        empty_masks = np.zeros((masks.shape[0], *out_shape), dtype=dtype)
        min_h = min(out_shape[0], masks.shape[1])
        min_w = min(out_shape[1], masks.shape[2])
        empty_masks[:, :min_h, :min_w] = masks[:, :min_h, :min_w]
        masks = empty_masks

    # from https://github.com/open-mmlab/mmcv/blob/v2.1.0/mmcv/image/geometric.py#L740-L788
    height, width = masks.shape[1:]
    if masks.ndim == 2:
        channels = 1
    elif masks.ndim == 3:
        channels = masks.shape[0]

    if isinstance(border_value, int):
        border_value = tuple([border_value] * channels)  # type: ignore[assignment]
    elif isinstance(border_value, tuple):
        assert len(border_value) == channels, (  # noqa: S101
            "Expected the num of elements in tuple equals the channels "
            f"of input image. Found {len(border_value)} vs {channels}"
        )
    else:
        msg = f"Invalid type {type(border_value)} for `border_value`."
        raise ValueError(msg)  # noqa: TRY004

    translate_matrix = _get_translate_matrix(offset, direction)
    translated_masks = cv2.warpAffine(
        masks.transpose((1, 2, 0)),
        translate_matrix,
        (width, height),
        # Note: when the number of elements in `border_value` is greater
        # than 3 (e.g. translating masks with more than 3 channels),
        # `cv2.warpAffine` raises TypeError. Here simply slice the first
        # 3 values in `border_value`.
        borderValue=border_value[:3],  # type: ignore[index]
        flags=CV2_INTERP_CODES[interpolation],
    )
    if translated_masks.ndim == 2:
        translated_masks = translated_masks[:, :, None]
    return translated_masks.transpose((2, 0, 1)).astype(dtype)
def translate_polygons(
    polygons: list[Polygon],
    out_shape: tuple[int, int],
    offset: int | float,
    direction: str = "horizontal",
    border_value: int | float = 0,
) -> list[Polygon]:
    """Translate polygons."""
    assert (  # noqa: S101
        border_value is None or border_value == 0
    ), f"Here border_value is not used, and should be None or 0 by default. got {border_value}."

    axis = 0 if direction == "horizontal" else 1
    out = out_shape[1] if direction == "horizontal" else out_shape[0]
    for polygon in polygons:
        p = np.asarray(polygon.points)
        p[axis::2] = np.clip(p[axis::2] + offset, 0, out)
        polygon.points = p.tolist()
    return polygons
def _get_translate_matrix(offset: int | float, direction: str = "horizontal") -> np.ndarray:
    """Generate the translate matrix.

    Args:
        offset (int | float): The offset used for translate.
        direction (str): The translate direction, either "horizontal" or "vertical".

    Returns:
        ndarray: The translate matrix with dtype float32.
    """
    if direction == "horizontal":
        translate_matrix = np.float32([[1, 0, offset], [0, 1, 0]])
    elif direction == "vertical":
        translate_matrix = np.float32([[1, 0, 0], [0, 1, offset]])
    return translate_matrix
def clip_bboxes(boxes: Tensor, img_shape: tuple[int, int]) -> Tensor:
    """Clip boxes according to the image shape in-place.

    Args:
        boxes (Tensor): Bounding boxes to be clipped.
        img_shape (tuple[int, int]): A tuple of image height and width.

    Returns:
        (Tensor): Clipped boxes.
    """
    h, w = img_shape
    boxes[..., 0::2] = boxes[..., 0::2].clamp(0, w)
    boxes[..., 1::2] = boxes[..., 1::2].clamp(0, h)
    return boxes
def is_inside_bboxes(
    boxes: Tensor,
    img_shape: tuple[int, int],
    all_inside: bool = False,
    allowed_border: int = 0,
) -> BoolTensor:
    """Find boxes inside the image.

    Args:
        boxes (Tensor): Bounding boxes to be checked.
        img_shape (tuple[int, int]): A tuple of image height and width.
        all_inside (bool): Whether the boxes must be entirely inside the image
            or only partially inside the image. Defaults to False.
        allowed_border (int): Boxes that extend beyond the image boundary by more
            than ``allowed_border`` are considered "outside". Defaults to 0.

    Returns:
        (BoolTensor): A BoolTensor indicating whether the box is inside the image.
            Assuming the original boxes have shape (m, n, 4), the output has shape (m, n).
    """
    img_h, img_w = img_shape
    if all_inside:
        return (
            (boxes[:, 0] >= -allowed_border)
            & (boxes[:, 1] >= -allowed_border)
            & (boxes[:, 2] < img_w + allowed_border)
            & (boxes[:, 3] < img_h + allowed_border)
        )
    return (
        (boxes[..., 0] < img_w + allowed_border)
        & (boxes[..., 1] < img_h + allowed_border)
        & (boxes[..., 2] > -allowed_border)
        & (boxes[..., 3] > -allowed_border)
    )
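
# Worked example (illustrative): with the default ``all_inside=False``, partial
# overlap with the image counts as inside; with ``all_inside=True``, every
# corner must lie within the image (plus ``allowed_border``).
#
#   boxes = torch.tensor([[-5.0, -5.0, 10.0, 10.0], [20.0, 20.0, 40.0, 40.0]])
#   is_inside_bboxes(boxes, (100, 100))                   # -> tensor([True, True])
#   is_inside_bboxes(boxes, (100, 100), all_inside=True)  # -> tensor([False, True])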
def flip_bboxes(boxes: Tensor, img_shape: tuple[int, int], direction: str = "horizontal") -> Tensor:
    """Flip boxes horizontally or vertically in-place.

    Args:
        boxes (Tensor): Bounding boxes to be flipped.
        img_shape (tuple[int, int]): A tuple of image height and width.
        direction (str): Flip direction, options are "horizontal", "vertical" and "diagonal".
            Defaults to "horizontal".

    Returns:
        (Tensor): Flipped bounding boxes.
    """
    assert direction in ["horizontal", "vertical", "diagonal"]  # noqa: S101

    flipped = boxes.clone()
    if direction == "horizontal":
        flipped[..., 0] = img_shape[1] - boxes[..., 2]
        flipped[..., 2] = img_shape[1] - boxes[..., 0]
    elif direction == "vertical":
        flipped[..., 1] = img_shape[0] - boxes[..., 3]
        flipped[..., 3] = img_shape[0] - boxes[..., 1]
    else:
        flipped[..., 0] = img_shape[1] - boxes[..., 2]
        flipped[..., 1] = img_shape[0] - boxes[..., 3]
        flipped[..., 2] = img_shape[1] - boxes[..., 0]
        flipped[..., 3] = img_shape[0] - boxes[..., 1]
    return flipped
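
# Worked example (illustrative): flipping horizontally in a (height, width) =
# (100, 200) image mirrors the x coordinates about the image width.
#
#   boxes = torch.tensor([[10.0, 20.0, 30.0, 40.0]])
#   flip_bboxes(boxes, (100, 200))  # -> tensor([[170.,  20., 190.,  40.]])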
def overlap_bboxes(
    bboxes1: Tensor,
    bboxes2: Tensor,
    mode: str = "iou",
    is_aligned: bool = False,
    eps: float = 1e-6,
) -> Tensor:
    """Calculate the overlap between two sets of bboxes.

    FP16 contributed by https://github.com/open-mmlab/mmdetection/pull/4889

    Note:
        Assume bboxes1 is M x 4 and bboxes2 is N x 4. When mode is 'iou',
        the following intermediate variables are generated while calculating
        IoU with this function:

        1) is_aligned is False

            area1: M x 1
            area2: N x 1
            lt: M x N x 2
            rb: M x N x 2
            wh: M x N x 2
            overlap: M x N x 1
            union: M x N x 1
            ious: M x N x 1

            Total memory:
                S = (9 x N x M + N + M) * 4 Byte

            When using FP16, we can reduce:
                R = (9 x N x M + N + M) * 4 / 2 Byte
                R > (N + M) * 4 * 2 always holds when N and M >= 1, since
                N + M <= N * M < 3 * N * M when N >= 2 and M >= 2, and
                N + 1 < 3 * N when N or M is 1.

            Given M = 40 (ground truths) and N = 400000 (three anchor boxes
            per grid, FPN, R-CNNs), R = 275 MB per pass.

            In a special case (dense detection) with M = 512 ground truths,
            R = 3516 MB = 3.43 GB.

            With batch size B, the reduction becomes B x R, so CUDA memory
            runs out frequently otherwise.

            Experiments on GeForce RTX 2080Ti (11019 MiB):

            | dtype |  M  |   N    |   Use    |   Real   |  Ideal   |
            |:-----:|:---:|:------:|:--------:|:--------:|:--------:|
            | FP32  | 512 | 400000 | 8020 MiB |    --    |    --    |
            | FP16  | 512 | 400000 | 4504 MiB | 3516 MiB | 3516 MiB |
            | FP32  |  40 | 400000 | 1540 MiB |    --    |    --    |
            | FP16  |  40 | 400000 | 1264 MiB |  276 MiB |  275 MiB |

        2) is_aligned is True

            area1: N x 1
            area2: N x 1
            lt: N x 2
            rb: N x 2
            wh: N x 2
            overlap: N x 1
            union: N x 1
            ious: N x 1

            Total memory:
                S = 11 x N * 4 Byte

            When using FP16, we can reduce:
                R = 11 x N * 4 / 2 Byte

        The same holds for 'giou', which uses even more memory than 'iou'.

        Time-wise, FP16 is generally faster than FP32. When gpu_assign_thr is
        not -1, it takes more time on CPU but does not reduce memory.
        Therefore, FP16 halves the memory while keeping the speed.

    If ``is_aligned`` is ``False``, calculate the overlaps between each bbox
    of bboxes1 and bboxes2; otherwise, calculate the overlaps between each
    aligned pair of bboxes1 and bboxes2.

    Args:
        bboxes1 (Tensor): shape (B, m, 4) in <x1, y1, x2, y2> format or empty.
        bboxes2 (Tensor): shape (B, n, 4) in <x1, y1, x2, y2> format or empty.
            B indicates the batch dim, in shape (B1, B2, ..., Bn).
            If ``is_aligned`` is ``True``, then m and n must be equal.
        mode (str): "iou" (intersection over union), "iof" (intersection over
            foreground) or "giou" (generalized intersection over union).
            Default "iou".
        is_aligned (bool, optional): If True, then m and n must be equal.
            Default False.
        eps (float, optional): A value added to the denominator for numerical
            stability. Default 1e-6.

    Returns:
        Tensor: shape (m, n) if ``is_aligned`` is False else shape (m,)

    Example:
        >>> bboxes1 = torch.FloatTensor([
        >>>     [0, 0, 10, 10],
        >>>     [10, 10, 20, 20],
        >>>     [32, 32, 38, 42],
        >>> ])
        >>> bboxes2 = torch.FloatTensor([
        >>>     [0, 0, 10, 20],
        >>>     [0, 10, 10, 19],
        >>>     [10, 10, 20, 20],
        >>> ])
        >>> overlaps = overlap_bboxes(bboxes1, bboxes2)
        >>> assert overlaps.shape == (3, 3)
        >>> overlaps = overlap_bboxes(bboxes1, bboxes2, is_aligned=True)
        >>> assert overlaps.shape == (3, )

    Example:
        >>> empty = torch.empty(0, 4)
        >>> nonempty = torch.FloatTensor([[0, 0, 10, 9]])
        >>> assert tuple(overlap_bboxes(empty, nonempty).shape) == (0, 1)
        >>> assert tuple(overlap_bboxes(nonempty, empty).shape) == (1, 0)
        >>> assert tuple(overlap_bboxes(empty, empty).shape) == (0, 0)
    """
    assert mode in ["iou", "iof", "giou"], f"Unsupported mode {mode}"  # noqa: S101
    # Either the boxes are empty or the length of boxes' last dimension is 4
    assert bboxes1.size(-1) == 4 or bboxes1.size(0) == 0  # noqa: S101
    assert bboxes2.size(-1) == 4 or bboxes2.size(0) == 0  # noqa: S101

    # Batch dim must be the same
    # Batch dim: (B1, B2, ... Bn)
    assert bboxes1.shape[:-2] == bboxes2.shape[:-2]  # noqa: S101
    batch_shape = bboxes1.shape[:-2]

    rows = bboxes1.size(-2)
    cols = bboxes2.size(-2)
    if is_aligned:
        assert rows == cols  # noqa: S101

    if rows * cols == 0:
        if is_aligned:
            return bboxes1.new((*batch_shape, rows))
        return bboxes1.new((*batch_shape, rows, cols))

    area1 = (bboxes1[..., 2] - bboxes1[..., 0]) * (bboxes1[..., 3] - bboxes1[..., 1])
    area2 = (bboxes2[..., 2] - bboxes2[..., 0]) * (bboxes2[..., 3] - bboxes2[..., 1])

    if is_aligned:
        lt = torch.max(bboxes1[..., :2], bboxes2[..., :2])  # [B, rows, 2]
        rb = torch.min(bboxes1[..., 2:], bboxes2[..., 2:])  # [B, rows, 2]

        wh = fp16_clamp(rb - lt, min=0)
        overlap = wh[..., 0] * wh[..., 1]

        union = area1 + area2 - overlap if mode in ["iou", "giou"] else area1
        if mode == "giou":
            enclosed_lt = torch.min(bboxes1[..., :2], bboxes2[..., :2])
            enclosed_rb = torch.max(bboxes1[..., 2:], bboxes2[..., 2:])
    else:
        lt = torch.max(bboxes1[..., :, None, :2], bboxes2[..., None, :, :2])  # [B, rows, cols, 2]
        rb = torch.min(bboxes1[..., :, None, 2:], bboxes2[..., None, :, 2:])  # [B, rows, cols, 2]

        wh = fp16_clamp(rb - lt, min=0)
        overlap = wh[..., 0] * wh[..., 1]

        union = area1[..., None] + area2[..., None, :] - overlap if mode in ["iou", "giou"] else area1[..., None]
        if mode == "giou":
            enclosed_lt = torch.min(bboxes1[..., :, None, :2], bboxes2[..., None, :, :2])
            enclosed_rb = torch.max(bboxes1[..., :, None, 2:], bboxes2[..., None, :, 2:])

    eps = union.new_tensor([eps])
    union = torch.max(union, eps)
    ious = overlap / union
    if mode in ["iou", "iof"]:
        return ious

    # calculate gious
    enclose_wh = fp16_clamp(enclosed_rb - enclosed_lt, min=0)
    enclose_area = enclose_wh[..., 0] * enclose_wh[..., 1]
    enclose_area = torch.max(enclose_area, eps)
    return ious - (enclose_area - union) / enclose_area
def centers_bboxes(boxes: Tensor) -> Tensor:
    """Return a tensor representing the centers of boxes."""
    return (boxes[..., :2] + boxes[..., 2:]) / 2
def fp16_clamp(x: Tensor, min: float | None = None, max: float | None = None) -> Tensor:  # noqa: A002
    """Clamp fp16 tensor."""
    if not x.is_cuda and x.dtype == torch.float16:
        # clamp for cpu float16, tensor fp16 has no clamp implementation
        return x.float().clamp(min, max).half()
    return x.clamp(min, max)
def scale_size(
    size: tuple[int, int],
    scale: float | int | tuple[float, float] | tuple[int, int],
) -> tuple[int, int]:
    """Rescale a size by a ratio.

    Args:
        size (tuple[int]): (height, width).
        scale (float | int | tuple(float) | tuple(int)): Scaling factor with (height, width).

    Returns:
        tuple[int]: scaled size with (height, width).
    """
    if isinstance(scale, (float, int)):
        scale = (scale, scale)
    h, w = size
    return int(h * float(scale[0]) + 0.5), int(w * float(scale[1]) + 0.5)
def rescale_size(
    old_size: tuple,
    scale: float | int | tuple[float, float] | tuple[int, int],
    return_scale: bool = False,
) -> tuple[int, int] | tuple[tuple[int, int], float | int]:
    """Calculate the new size to be rescaled to.

    Args:
        old_size (tuple[int]): The old size (height, width) of image.
        scale (float | int | tuple[float] | tuple[int]): The scaling factor or maximum size.
            If it is a float number, an integer, or a tuple of 2 float numbers, the image
            will be rescaled by this factor; if it is a tuple of 2 integers, the image will
            be rescaled as large as possible within the scale.
        return_scale (bool): Whether to return the scaling factor besides the
            rescaled image size.

    Returns:
        tuple[int]: The new rescaled image size with (height, width). If return_scale
            is True, the scale factor that was applied is returned as well.
    """
    h, w = old_size
    msg = ""
    if isinstance(scale, (float, int)):
        if scale <= 0:
            msg = f"Invalid scale {scale}, must be positive."
            raise ValueError(msg)
        scale_factor = scale
    elif isinstance(scale, tuple):
        if isinstance(scale[0], int):
            max_long_edge = max(scale)
            max_short_edge = min(scale)
            scale_factor = min(max_long_edge / max(h, w), max_short_edge / min(h, w))
        elif isinstance(scale[0], float):
            scale_factor = scale  # type: ignore[assignment]
        else:
            msg = f"Scale must be a number or tuple of int/float, but got tuple of {type(scale[0])}"
    else:
        msg = f"Scale must be a number or tuple of int/float, but got {type(scale)}"

    if msg:
        raise TypeError(msg)

    new_size = scale_size((h, w), scale_factor)

    if return_scale:
        return new_size, scale_factor
    return new_size
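
# Worked examples (illustrative): with a tuple of two ints, the size is scaled
# to fit within the given maximum size while keeping the aspect ratio; with a
# number, it is scaled directly by that factor.
#
#   rescale_size((400, 600), 0.5)                      # -> (200, 300)
#   rescale_size((400, 600), (800, 1000))              # -> (667, 1000)
#   rescale_size((400, 600), 0.5, return_scale=True)   # -> ((200, 300), 0.5)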
def flip_image(img: np.ndarray | list[np.ndarray], direction: str = "horizontal") -> np.ndarray | list[np.ndarray]:
    """Flip an image horizontally or vertically.

    Args:
        img (ndarray): Image to be flipped.
        direction (str): The flip direction, either "horizontal", "vertical", or "diagonal".

    Returns:
        ndarray: The flipped image.
    """
    if direction not in ["horizontal", "vertical", "diagonal"]:
        msg = f"direction (={direction}) should be one of ('horizontal', 'vertical', 'diagonal')."
        raise ValueError(msg)

    if isinstance(img, list):
        return [flip_image(im, direction) for im in img]

    if direction == "horizontal":
        return np.flip(img, axis=1)
    elif direction == "vertical":  # noqa: RET505
        return np.flip(img, axis=0)
    else:
        return np.flip(img, axis=(0, 1))
def flip_masks(masks: np.ndarray, direction: str = "horizontal") -> np.ndarray:
    """Flip masks along the given direction."""
    assert direction in ("horizontal", "vertical", "diagonal")  # noqa: S101

    return np.stack([flip_image(mask, direction=direction) for mask in masks])
def flip_polygons(polygons: list[Polygon], height: int, width: int, direction: str = "horizontal") -> list[Polygon]:
    """Flip polygons along the given direction."""
    for polygon in polygons:
        p = np.asarray(polygon.points)
        if direction == "horizontal":
            p[0::2] = width - p[0::2]
        elif direction == "vertical":
            p[1::2] = height - p[1::2]
        else:
            p[0::2] = width - p[0::2]
            p[1::2] = height - p[1::2]
        polygon.points = p.tolist()
    return polygons
def project_bboxes(boxes: Tensor, homography_matrix: Tensor | np.ndarray) -> Tensor:
    """Geometrically transform boxes in-place.

    Reference : https://github.com/open-mmlab/mmdetection/blob/v3.2.0/mmdet/structures/bbox/horizontal_boxes.py#L184-L202

    Args:
        boxes (Tensor): Bounding boxes to be transformed.
        homography_matrix (Tensor | np.ndarray): Shape (3, 3) for geometric transformation.

    Returns:
        (Tensor): Projected bounding boxes.
    """
    if isinstance(homography_matrix, np.ndarray):
        homography_matrix = boxes.new_tensor(homography_matrix)
    corners = hbox2corner(boxes)
    corners = torch.cat([corners, corners.new_ones(*corners.shape[:-1], 1)], dim=-1)
    corners_t = torch.transpose(corners, -1, -2)
    corners_t = torch.matmul(homography_matrix, corners_t)
    corners = torch.transpose(corners_t, -1, -2)
    # Convert back from homogeneous coordinates by normalization
    corners = corners[..., :2] / corners[..., 2:3]
    return corner2hbox(corners)
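
# Worked example (illustrative): a homography that is a pure translation by
# (5, 10) simply shifts the box.
#
#   boxes = torch.tensor([[0.0, 0.0, 10.0, 10.0]])
#   h = np.array([[1.0, 0.0, 5.0], [0.0, 1.0, 10.0], [0.0, 0.0, 1.0]])
#   project_bboxes(boxes, h)  # -> tensor([[ 5., 10., 15., 20.]])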
def hbox2corner(boxes: Tensor) -> Tensor:
    """Convert box coordinates from (x1, y1, x2, y2) to corners ((x1, y1), (x2, y1), (x1, y2), (x2, y2)).

    Reference : https://github.com/open-mmlab/mmdetection/blob/v3.2.0/mmdet/structures/bbox/horizontal_boxes.py#L204-L217

    Args:
        boxes (Tensor): Horizontal box tensor with shape of (..., 4).

    Returns:
        Tensor: Corner tensor with shape of (..., 4, 2).
    """
    x1, y1, x2, y2 = torch.split(boxes, 1, dim=-1)
    corners = torch.cat([x1, y1, x2, y1, x1, y2, x2, y2], dim=-1)
    return corners.reshape(*corners.shape[:-1], 4, 2)
def corner2hbox(corners: Tensor) -> Tensor:
    """Convert box coordinates from corners ((x1, y1), (x2, y1), (x1, y2), (x2, y2)) to (x1, y1, x2, y2).

    Reference : https://github.com/open-mmlab/mmdetection/blob/v3.2.0/mmdet/structures/bbox/horizontal_boxes.py#L219-L234

    Args:
        corners (Tensor): Corner tensor with shape of (..., 4, 2).

    Returns:
        Tensor: Horizontal box tensor with shape of (..., 4).
    """
    if corners.numel() == 0:
        return corners.new_zeros((0, 4))
    min_xy = corners.min(dim=-2)[0]
    max_xy = corners.max(dim=-2)[0]
    return torch.cat([min_xy, max_xy], dim=-1)
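
# Round-trip sketch (illustrative): ``corner2hbox`` takes the min/max over the
# corner dimension, so it inverts ``hbox2corner`` for axis-aligned boxes.
#
#   boxes = torch.tensor([[0.0, 0.0, 10.0, 20.0]])
#   corner2hbox(hbox2corner(boxes))  # -> tensor([[ 0.,  0., 10., 20.]])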
def crop_masks(masks: np.ndarray, bbox: np.ndarray) -> np.ndarray:
    """Crop each mask by the given bbox."""
    assert isinstance(bbox, np.ndarray)  # noqa: S101
    assert bbox.ndim == 1  # noqa: S101

    height, width = masks.shape[1:]

    # clip the boundary
    bbox = bbox.copy()
    bbox[0::2] = np.clip(bbox[0::2], 0, width)
    bbox[1::2] = np.clip(bbox[1::2], 0, height)
    x1, y1, x2, y2 = bbox
    w = np.maximum(x2 - x1, 1)
    h = np.maximum(y2 - y1, 1)

    return masks[:, y1 : y1 + h, x1 : x1 + w]
def crop_polygons(polygons: list[Polygon], bbox: np.ndarray, height: int, width: int) -> list[Polygon]:
    """Crop each polygon by the given bbox."""
    assert isinstance(bbox, np.ndarray)  # noqa: S101
    assert bbox.ndim == 1  # noqa: S101

    # clip the boundary
    bbox = bbox.copy()
    bbox[0::2] = np.clip(bbox[0::2], 0, width)
    bbox[1::2] = np.clip(bbox[1::2], 0, height)

    x1, y1, x2, y2 = bbox
    # reference: https://github.com/facebookresearch/fvcore/blob/main/fvcore/transforms/transform.py
    crop_box = geometry.box(x1, y1, x2, y2).buffer(0.0)

    # suppress shapely warnings until it incorporates GEOS>=3.11.2
    # reference: https://github.com/shapely/shapely/issues/1345
    initial_settings = np.seterr()
    np.seterr(invalid="ignore")

    for polygon in polygons:
        cropped_poly_per_obj: list[Polygon] = []

        p = np.asarray(polygon.points).copy()
        p = geometry.Polygon(p.reshape(-1, 2)).buffer(0.0)

        # polygon must be valid to perform intersection.
        if not p.is_valid:
            # a dummy polygon to avoid misalignment between masks and boxes
            polygon.points = [0, 0, 0, 0, 0, 0]
            continue

        cropped = p.intersection(crop_box)
        if cropped.is_empty:
            # a dummy polygon to avoid misalignment between masks and boxes
            polygon.points = [0, 0, 0, 0, 0, 0]
            continue

        cropped = cropped.geoms if isinstance(cropped, geometry.collection.BaseMultipartGeometry) else [cropped]

        # one polygon may be cropped to multiple ones
        for poly in cropped:
            # ignore lines or points
            if not isinstance(poly, geometry.Polygon) or not poly.is_valid:
                continue
            coords = np.asarray(poly.exterior.coords)
            # remove an extra identical vertex at the end
            coords = coords[:-1]
            coords[:, 0] -= x1
            coords[:, 1] -= y1
            cropped_poly_per_obj.append(coords.reshape(-1).tolist())

        # a dummy polygon to avoid misalignment between masks and boxes
        if len(cropped_poly_per_obj) == 0:
            cropped_poly_per_obj.append([0, 0, 0, 0, 0, 0])

        polygon.points = list(itertools.chain(*cropped_poly_per_obj))

    np.seterr(**initial_settings)
    return polygons
def get_bboxes_from_masks(masks: Tensor) -> np.ndarray:
    """Create boxes from masks."""
    num_masks = len(masks)
    bboxes = np.zeros((num_masks, 4), dtype=np.float32)

    x_any = masks.any(axis=1)
    y_any = masks.any(axis=2)
    for idx in range(num_masks):
        x = np.where(x_any[idx, :])[0]
        y = np.where(y_any[idx, :])[0]
        if len(x) > 0 and len(y) > 0:
            # use +1 for x_max and y_max so that the right and bottom
            # boundary of instance masks are fully included by the box
            bboxes[idx, :] = np.array([x[0], y[0], x[-1] + 1, y[-1] + 1], dtype=np.float32)
    return bboxes
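
# Worked example (illustrative): a mask occupying rows/cols 1..2 of a 4x4 grid
# yields an exclusive right/bottom boundary of 3. ``masks`` is passed as a
# numpy array here, matching the numpy operations used in the body.
#
#   masks = np.zeros((1, 4, 4), dtype=bool)
#   masks[0, 1:3, 1:3] = True
#   get_bboxes_from_masks(masks)  # -> array([[1., 1., 3., 3.]], dtype=float32)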
def get_bboxes_from_polygons(polygons: list[Polygon], height: int, width: int) -> np.ndarray:
    """Create boxes from polygons."""
    num_polygons = len(polygons)
    boxes = np.zeros((num_polygons, 4), dtype=np.float32)
    for idx, polygon in enumerate(polygons):
        # simply use a number that is big enough for comparison with coordinates
        xy_min = np.array([width * 2, height * 2], dtype=np.float32)
        xy_max = np.zeros(2, dtype=np.float32)

        xy = np.array(polygon.points).reshape(-1, 2).astype(np.float32)
        xy_min = np.minimum(xy_min, np.min(xy, axis=0))
        xy_max = np.maximum(xy_max, np.max(xy, axis=0))

        boxes[idx, :2] = xy_min
        boxes[idx, 2:] = xy_max
    return boxes
def area_polygon(x: np.ndarray, y: np.ndarray) -> np.ndarray:
    """Compute the area of a component of a polygon.

    Using the shoelace formula:
    https://stackoverflow.com/questions/24467972/calculate-area-of-polygon-given-x-y-coordinates

    Args:
        x (ndarray): x coordinates of the component
        y (ndarray): y coordinates of the component

    Return:
        (float): the area of the component
    """
    return 0.5 * np.abs(np.dot(x, np.roll(y, 1)) - np.dot(y, np.roll(x, 1)))
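
# Worked example (illustrative): the shoelace formula applied to the unit square.
#
#   x = np.array([0.0, 1.0, 1.0, 0.0])
#   y = np.array([0.0, 0.0, 1.0, 1.0])
#   area_polygon(x, y)  # -> 1.0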