diff --git a/gallery/transforms/plot_rotated_box_transforms.py b/gallery/transforms/plot_rotated_box_transforms.py
index ddc4a200417..7c6e3a559df 100644
--- a/gallery/transforms/plot_rotated_box_transforms.py
+++ b/gallery/transforms/plot_rotated_box_transforms.py
@@ -3,10 +3,13 @@
 Transforms on Rotated Bounding Boxes
 ===============================================================
 
-This example illustrates how to define and use rotated bounding boxes. We'll
-cover how to define them, demonstrate their usage with some of the existing
-transforms, and finally some of their unique behavior in comparision to
-standard bounding boxes.
+This example illustrates how to define and use rotated bounding boxes.
+
+.. note::
+    Support for rotated bounding boxes was released in TorchVision 0.23 and is
+    currently a BETA feature. We don't expect the API to change, but there may
+    be some rare edge cases. If you find any issues, please report them on
+    our bug tracker: https://github.com/pytorch/vision/issues?q=is:open+is:issue
 
 First, a bit of setup code:
 """
@@ -18,7 +21,7 @@
 
 import torch
 
-from torchvision import tv_tensors
+from torchvision.tv_tensors import BoundingBoxes
 from torchvision.transforms import v2
 from helpers import plot
 
@@ -37,16 +40,16 @@
 # Creating a Rotated Bounding Box
 # -------------------------------
 # Rotated bounding boxes are created by instantiating the
-# :class:`~torchvision.tv_tensors.BoundingBoxes` class. It's the `format`
+# :class:`~torchvision.tv_tensors.BoundingBoxes` class. It's the ``format``
 # parameter of the constructor that determines if a bounding box is rotated or
-# not. In this instance, we use the
-# :attr:`~torchvision.tv_tensors.BoundingBoxFormat` kind `CXCYWHR`. The first
-# two values are the `x` and `y` coordinates of the center of the bounding box.
-# The next two values are the `width` and `height` of the bounding box, and the
-# last value is the `rotation` of the bounding box.
+# not. In this instance, we use the CXCYWHR
+# :attr:`~torchvision.tv_tensors.BoundingBoxFormat`. The first two values are
+# the X and Y coordinates of the center of the bounding box. The next two
+# values are the width and height of the bounding box, and the last value is
+# the rotation of the bounding box, in degrees.
 
-orig_box = tv_tensors.BoundingBoxes(
+orig_box = BoundingBoxes(
     [
         [860.0, 1100, 570, 1840, -7],
     ],
@@ -57,20 +60,16 @@
 plot([(orig_img, orig_box)], bbox_width=10)
 
 # %%
-# Rotation
-# --------
-# Rotated bounding boxes maintain their rotation with respect to the image even
-# when the image itself is rotated through the
-# :class:`~torchvision.transforms.RandomRotation` transform.
+# Transforms illustrations
+# ------------------------
+#
+# Using :class:`~torchvision.transforms.RandomRotation`:
 rotater = v2.RandomRotation(degrees=(0, 180), expand=True)
 rotated_imgs = [rotater((orig_img, orig_box)) for _ in range(4)]
 plot([(orig_img, orig_box)] + rotated_imgs, bbox_width=10)
 
 # %%
-# Padding
-# -------
-# Rotated bounding boxes also maintain their properties when the image is padded using
-# :class:`~torchvision.transforms.Pad`.
+# Using :class:`~torchvision.transforms.Pad`:
 padded_imgs_and_boxes = [
     v2.Pad(padding=padding)(orig_img, orig_box)
     for padding in (30, 50, 100, 200)
@@ -78,16 +77,7 @@
 plot([(orig_img, orig_box)] + padded_imgs_and_boxes, bbox_width=10)
 
 # %%
-# Resizing
-# --------
-# Rotated bounding boxes are also resized along with an image in the
-# :class:`~torchvision.transforms.Resize` transform.
-#
-# Note that the bounding box looking bigger in the images with less pixels is
-# an artifact, not reality. That is merely the rasterised representation of the
-# bounding box's boundaries appearing bigger because we specify a fixed width of
-# that rasterized line. When the image is, say, only 30 pixels wide, a
-# line that is 3 pixels wide is relatively large.
+# Using :class:`~torchvision.transforms.Resize`:
 resized_imgs = [
     v2.Resize(size=size)(orig_img, orig_box)
     for size in (30, 50, 100, orig_img.size)
 ]
@@ -95,62 +85,111 @@
 plot([(orig_img, orig_box)] + resized_imgs, bbox_width=5)
 
 # %%
-# Perspective
-# -----------
-# The rotated bounding box is also transformed along with the image when the
-# perspective is transformed with :class:`~torchvision.transforms.RandomPerspective`.
-perspective_transformer = v2.RandomPerspective(distortion_scale=0.6, p=1.0)
-perspective_imgs = [perspective_transformer(orig_img, orig_box) for _ in range(4)]
-plot([(orig_img, orig_box)] + perspective_imgs, bbox_width=10)
-
-# %%
-# Elastic Transform
-# -----------------
-# The rotated bounding box is appropriately unchanged when going through the
-# :class:`~torchvision.transforms.ElasticTransform`.
-elastic_imgs = [
-    v2.ElasticTransform(alpha=alpha)(orig_img, orig_box)
-    for alpha in (100.0, 500.0, 1000.0, 2000.0)
-]
-plot([(orig_img, orig_box)] + elastic_imgs, bbox_width=10)
-
-# %%
-# Crop & Clamping Modes
-# ---------------------
-# The :class:`~torchvision.transforms.CenterCrop` transform selectively crops
-# the image on a center location. The behavior of the rotated bounding box
-# depends on its `clamping_mode`. We can set the `clamping_mode` in the
-# :class:`~torchvision.tv_tensors.BoundingBoxes` constructur, or by directly
-# setting it after construction as we do in the example below.
+# Note that the bounding box looking bigger in the images with fewer pixels is
+# an artifact, not reality. That is merely the rasterized representation of the
+# bounding box's boundaries appearing bigger because we specify a fixed width
+# for that rasterized line. When the image is, say, only 30 pixels wide, a
+# line that is 3 pixels wide is relatively large.
 #
-# There are two values for `clamping_mode`:
+# .. _clamping_mode_tuto:
 #
-# - `"soft"`: The default when constucting
-#   :class:`~torchvision.tv_tensors.BoundingBoxes`.
-# - `"hard"`:
+# Clamping mode and its effect on transforms
+# ------------------------------------------
 #
-# For standard bounding boxes, both modes behave the same. We also need to
-# document:
+# Some transforms, such as :class:`~torchvision.transforms.CenterCrop`, may
+# result in the transformed bounding box being partially outside of the
+# transformed (cropped) image. In general, this may happen with most of the
+# :ref:`geometric transforms `.
 #
-# - `clamping_mode` for individual kernels.
-# - `clamping_mode` in :class:`~torchvision.transforms.v2.ClampBoundingBoxes`.
-# - the new :class:`~torchvision.transforms.v2.SetClampingMode` transform.
+# In such cases, the bounding box is clamped to the transformed image size based
+# on its ``clamping_mode`` attribute. There are three possible values for
+# ``clamping_mode``, which determine how the box is clamped after a
+# transformation:
 #
+# - ``None``: No clamping is applied, and the bounding box may be partially
+#   outside of the image.
+# - ``"hard"``: The box is clamped to the image size, such that all its corners
+#   are within the image canvas. This potentially results in a loss of
+#   information, and it can lead to unintuitive results, but it may be
+#   necessary for some applications, e.g. if the model doesn't support boxes
+#   outside of the image.
+# - ``"soft"``: An intermediate mode between ``None`` and ``"hard"``: the box
+#   is clamped, but not as strictly as in ``"hard"`` mode. Some box dimensions
+#   may still be outside of the image. This is the default when constructing
+#   :class:`~torchvision.tv_tensors.BoundingBoxes`.
+#
+# .. note::
+#
+#     For axis-aligned bounding boxes, the ``"soft"`` and ``"hard"`` modes
+#     behave the same, as the bounding box is always clamped to the image size.
+#
+# Let's illustrate the clamping modes with the
+# :class:`~torchvision.transforms.CenterCrop` transform:
+
 assert orig_box.clamping_mode == "soft"
-hard_box = orig_box.clone()
-hard_box.clamping_mode = "hard"
+box_hard_clamping = BoundingBoxes(orig_box, format=orig_box.format, canvas_size=orig_box.canvas_size, clamping_mode="hard")
+
+box_no_clamping = BoundingBoxes(orig_box, format=orig_box.format, canvas_size=orig_box.canvas_size, clamping_mode=None)
+
+crop_sizes = (800, 1200, 2000, orig_img.size)
 soft_center_crops_and_boxes = [
     v2.CenterCrop(size=size)(orig_img, orig_box)
-    for size in (800, 1200, 2000, orig_img.size)
+    for size in crop_sizes
 ]
 
 hard_center_crops_and_boxes = [
-    v2.CenterCrop(size=size)(orig_img, hard_box)
-    for size in (800, 1200, 2000, orig_img.size)
+    v2.CenterCrop(size=size)(orig_img, box_hard_clamping)
+    for size in crop_sizes
+]
+
+no_clamping_center_crops_and_boxes = [
+    v2.CenterCrop(size=size)(orig_img, box_no_clamping)
+    for size in crop_sizes
 ]
 
-plot([[(orig_img, orig_box)] + soft_center_crops_and_boxes,
-      [(orig_img, hard_box)] + hard_center_crops_and_boxes],
+plot([[(orig_img, box_hard_clamping)] + hard_center_crops_and_boxes,
+      [(orig_img, orig_box)] + soft_center_crops_and_boxes,
+      [(orig_img, box_no_clamping)] + no_clamping_center_crops_and_boxes],
      bbox_width=10)
+
+# %%
+# The plot above shows the ``"hard"``, ``"soft"``, and ``None`` clamping modes,
+# in this order. While ``"soft"`` and ``None`` result in similar plots, they do
+# not lead to the exact same clamped boxes: the non-clamped boxes can have
+# coordinates that are further outside of the image:
+print("boxes with soft clamping:")
+print(soft_center_crops_and_boxes)
+print()
+print("boxes with no clamping:")
+print(no_clamping_center_crops_and_boxes)
+
+# %%
+#
+# Setting the clamping mode
+# -------------------------
+#
+# The ``clamping_mode`` attribute, which determines the clamping strategy that
+# is applied to a box, can be set in different ways:
+#
+# - By passing ``clamping_mode`` to the
+#   :class:`~torchvision.tv_tensors.BoundingBoxes` constructor, as done in the example above.
+# - By directly setting the attribute on an existing instance, e.g. ``boxes.clamping_mode = "hard"``.
+# - By calling the :class:`~torchvision.transforms.v2.SetClampingMode` transform.
+#
+# Also, remember that you can always clamp the bounding box manually by
+# calling the :class:`~torchvision.transforms.v2.ClampBoundingBoxes` transform!
+# Here's an example illustrating all of these options:
+
+t = v2.Compose([
+    v2.CenterCrop(size=(800,)),  # clamps according to the current clamping_mode
+                                 # attribute, in this case set by the constructor
+    v2.SetClampingMode(None),  # sets the clamping_mode attribute for future transforms
+    v2.Pad(padding=3),  # clamps according to the current clamping_mode,
+                        # i.e. ``None``
+    v2.ClampBoundingBoxes(clamping_mode="soft"),  # clamps with "soft" mode
+])
+
+out_img, out_box = t(orig_img, orig_box)
+plot([(orig_img, orig_box), (out_img, out_box)], bbox_width=10)
+
+# %%
diff --git a/torchvision/transforms/v2/_meta.py b/torchvision/transforms/v2/_meta.py
index 68395b468ba..39f223f0398 100644
--- a/torchvision/transforms/v2/_meta.py
+++ b/torchvision/transforms/v2/_meta.py
@@ -27,11 +27,10 @@ def transform(self, inpt: tv_tensors.BoundingBoxes, params: dict[str, Any]) -> t
 class ClampBoundingBoxes(Transform):
     """Clamp bounding boxes to their corresponding image dimensions.
 
-    The clamping is done according to the bounding boxes' ``canvas_size`` meta-data.
-
     Args:
-        clamping_mode: TODOBB more docs. Default is None which relies on the input box' clamping_mode attribute.
-
+        clamping_mode: The clamping mode to use. Default is "auto", which
+            relies on the input box's ``clamping_mode`` attribute. See
+            :ref:`clamping_mode_tuto` for more details on how to use this transform.
     """
 
     def __init__(self, clamping_mode: Union[CLAMPING_MODE_TYPE, str] = "auto") -> None:
@@ -57,7 +56,13 @@ def transform(self, inpt: tv_tensors.KeyPoints, params: dict[str, Any]) -> tv_te
 
 class SetClampingMode(Transform):
-    """TODOBB"""
+    """Set the ``clamping_mode`` attribute of the bounding boxes for future transforms.
+
+    Args:
+        clamping_mode: The clamping mode to set. Possible values are ``"soft"``,
+            ``"hard"``, or ``None``. See :ref:`clamping_mode_tuto` for more
+            details on how to use this transform.
+    """
 
     def __init__(self, clamping_mode: CLAMPING_MODE_TYPE) -> None:
         super().__init__()
diff --git a/torchvision/tv_tensors/_bounding_boxes.py b/torchvision/tv_tensors/_bounding_boxes.py
index c162bee9949..7aa3e50458d 100644
--- a/torchvision/tv_tensors/_bounding_boxes.py
+++ b/torchvision/tv_tensors/_bounding_boxes.py
@@ -59,12 +59,17 @@ def is_rotated_bounding_format(format: BoundingBoxFormat | str) -> bool:
 # This should ideally be a Literal, but torchscript fails.
 CLAMPING_MODE_TYPE = Optional[str]
 
-# TODOBB All docs. Add any new API to rst files, add tutorial[s].
-
 
 class BoundingBoxes(TVTensor):
     """:class:`torch.Tensor` subclass for bounding boxes with shape ``[N, K]``.
 
+    .. note::
+        Support for rotated bounding boxes was released in TorchVision 0.23 and
+        is currently a BETA feature. We don't expect the API to change, but
+        there may be some rare edge cases. If you find any issues, please report
+        them on our bug tracker:
+        https://github.com/pytorch/vision/issues?q=is:open+is:issue
+
     Where ``N`` is the number of bounding boxes
     and ``K`` is 4 for unrotated boxes, and 5 or 8 for rotated boxes.
 
@@ -78,7 +83,8 @@ class BoundingBoxes(TVTensor):
         data: Any data that can be turned into a tensor with :func:`torch.as_tensor`.
         format (BoundingBoxFormat, str): Format of the bounding box.
         canvas_size (two-tuple of ints): Height and width of the corresponding image or video.
-        clamping_mode: TODOBB
+        clamping_mode: The clamping mode to use when applying transforms that may result in bounding boxes
+            partially outside of the image. Possible values are ``"soft"``, ``"hard"``, or ``None``. See :ref:`clamping_mode_tuto` for more details.
         dtype (torch.dtype, optional): Desired data type of the bounding box. If omitted, will be inferred from
             ``data``.
         device (torch.device, optional): Desired device of the bounding box.
            If omitted and ``data`` is a
diff --git a/torchvision/tv_tensors/_keypoints.py b/torchvision/tv_tensors/_keypoints.py
index 48c3d0a5c02..aede31ad7db 100644
--- a/torchvision/tv_tensors/_keypoints.py
+++ b/torchvision/tv_tensors/_keypoints.py
@@ -11,6 +11,12 @@ class KeyPoints(TVTensor):
     """:class:`torch.Tensor` subclass for tensors with shape ``[..., 2]`` that represent points in an image.
 
+    .. note::
+        Support for keypoints was released in TorchVision 0.23 and is currently
+        a BETA feature. We don't expect the API to change, but there may be some
+        rare edge cases. If you find any issues, please report them on our bug
+        tracker: https://github.com/pytorch/vision/issues?q=is:open+is:issue
+
     Each point is represented by its X and Y coordinates along the width and
     height dimensions, respectively. KeyPoints may represent any object that can
     be represented by sequences of 2D points:
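+
+    For instance, a minimal construction sketch (illustrative values; like
+    :class:`~torchvision.tv_tensors.BoundingBoxes`, ``canvas_size`` is
+    ``(height, width)``)::
+
+        from torchvision import tv_tensors
+
+        # Two keypoints on a 480x640 image: one near the top-left corner
+        # and one at the center. Each row is an (X, Y) pair.
+        points = tv_tensors.KeyPoints(
+            [[10.0, 15.0], [320.0, 240.0]],
+            canvas_size=(480, 640),
+        )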