diff --git a/docs/source/conf.py b/docs/source/conf.py
index df6cca3856a..26771a7b711 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -87,6 +87,7 @@ def __init__(self, src_dir):
         "plot_transforms_illustrations.py",
         "plot_transforms_e2e.py",
         "plot_cutmix_mixup.py",
+        "plot_rotated_box_transforms.py",
         "plot_custom_transforms.py",
         "plot_tv_tensors.py",
         "plot_custom_tv_tensors.py",
diff --git a/docs/source/transforms.rst b/docs/source/transforms.rst
index 4b00fab023d..44b4cc3aaa5 100644
--- a/docs/source/transforms.rst
+++ b/docs/source/transforms.rst
@@ -1,14 +1,20 @@
 .. _transforms:
 
-Transforming and augmenting images
-==================================
+Transforming images, videos, boxes and more
+===========================================
 
 .. currentmodule:: torchvision.transforms
 
 Torchvision supports common computer vision transformations in the
-``torchvision.transforms`` and ``torchvision.transforms.v2`` modules. Transforms
-can be used to transform or augment data for training or inference of different
-tasks (image classification, detection, segmentation, video classification).
+``torchvision.transforms.v2`` module. Transforms can be used to transform and
+augment data, for both training and inference. The following objects are
+supported:
+
+- Images as pure tensors, :class:`~torchvision.tv_tensors.Image` or PIL image
+- Videos as :class:`~torchvision.tv_tensors.Video`
+- Axis-aligned and rotated bounding boxes as :class:`~torchvision.tv_tensors.BoundingBoxes`
+- Segmentation and detection masks as :class:`~torchvision.tv_tensors.Mask`
+- KeyPoints as :class:`~torchvision.tv_tensors.KeyPoints`
 
 .. code:: python
 
@@ -111,9 +117,9 @@ In Torchvision 0.15 (March 2023), we released a new set of transforms available
 in the ``torchvision.transforms.v2`` namespace. These transforms have a lot of
 advantages compared to the v1 ones (in ``torchvision.transforms``):
 
-- They can transform images **but also** bounding boxes, masks, or videos. This
-  provides support for tasks beyond image classification: detection, segmentation,
-  video classification, etc. See
+- They can transform images **and also** bounding boxes, masks, videos and
+  keypoints. This provides support for tasks beyond image classification:
+  detection, segmentation, video classification, pose estimation, etc. See
   :ref:`sphx_glr_auto_examples_transforms_plot_transforms_getting_started.py`
   and :ref:`sphx_glr_auto_examples_transforms_plot_transforms_e2e.py`.
 - They support more transforms like :class:`~torchvision.transforms.v2.CutMix`
diff --git a/gallery/assets/leaning_tower.jpg b/gallery/assets/leaning_tower.jpg
new file mode 100644
index 00000000000..fc6e0779f7c
Binary files /dev/null and b/gallery/assets/leaning_tower.jpg differ
diff --git a/gallery/transforms/helpers.py b/gallery/transforms/helpers.py
index e94d717eb7d..bc8de0d2ad1 100644
--- a/gallery/transforms/helpers.py
+++ b/gallery/transforms/helpers.py
@@ -2,10 +2,11 @@
 import torch
 from torchvision.utils import draw_bounding_boxes, draw_segmentation_masks
 from torchvision import tv_tensors
+from torchvision.transforms import v2
 from torchvision.transforms.v2 import functional as F
 
 
-def plot(imgs, row_title=None, **imshow_kwargs):
+def plot(imgs, row_title=None, bbox_width=3, **imshow_kwargs):
     if not isinstance(imgs[0], list):
         # Make a 2d grid even if there's just 1 row
         imgs = [imgs]
@@ -24,6 +25,11 @@ def plot(imgs, row_title=None, **imshow_kwargs):
                     masks = target.get("masks")
                 elif isinstance(target, tv_tensors.BoundingBoxes):
                     boxes = target
+
+                    # Conversion necessary because draw_bounding_boxes() only
+                    # works with this specific format.
+                    if tv_tensors.is_rotated_bounding_format(boxes.format):
+                        boxes = v2.ConvertBoundingBoxFormat("xyxyxyxy")(boxes)
                 else:
                     raise ValueError(f"Unexpected target type: {type(target)}")
             img = F.to_image(img)
@@ -35,7 +41,7 @@
             img = F.to_dtype(img, torch.uint8, scale=True)
 
             if boxes is not None:
-                img = draw_bounding_boxes(img, boxes, colors="yellow", width=3)
+                img = draw_bounding_boxes(img, boxes, colors="yellow", width=bbox_width)
             if masks is not None:
                 img = draw_segmentation_masks(img, masks.to(torch.bool), colors=["green"] * masks.shape[0], alpha=.65)
 
diff --git a/gallery/transforms/plot_rotated_box_transforms.py b/gallery/transforms/plot_rotated_box_transforms.py
new file mode 100644
index 00000000000..7c6e3a559df
--- /dev/null
+++ b/gallery/transforms/plot_rotated_box_transforms.py
@@ -0,0 +1,195 @@
+"""
+===============================================================
+Transforms on Rotated Bounding Boxes
+===============================================================
+
+This example illustrates how to define and use rotated bounding boxes.
+
+.. note::
+    Support for rotated bounding boxes was released in TorchVision 0.23 and is
+    currently a BETA feature. We don't expect the API to change, but there may
+    be some rare edge-cases. If you find any issues, please report them on
+    our bug tracker: https://github.com/pytorch/vision/issues?q=is:open+is:issue
+
+First, a bit of setup code:
+"""
+
+# %%
+from PIL import Image
+from pathlib import Path
+import matplotlib.pyplot as plt
+
+
+import torch
+from torchvision.tv_tensors import BoundingBoxes
+from torchvision.transforms import v2
+from helpers import plot
+
+plt.rcParams["figure.figsize"] = [10, 5]
+plt.rcParams["savefig.bbox"] = "tight"
+
+# if you change the seed, make sure that the randomly-applied transforms
+# properly show that the image can be both transformed and *not* transformed!
+torch.manual_seed(0)
+
+# If you're trying to run this on Colab, you can download the assets and the
+# helpers from https://github.com/pytorch/vision/tree/main/gallery/
+orig_img = Image.open(Path('../assets') / 'leaning_tower.jpg')
+
+# %%
+# Creating a Rotated Bounding Box
+# -------------------------------
+# Rotated bounding boxes are created by instantiating the
+# :class:`~torchvision.tv_tensors.BoundingBoxes` class.
+# It's the ``format`` parameter of the constructor that determines whether a
+# bounding box is rotated or not. In this instance, we use the CXCYWHR
+# :attr:`~torchvision.tv_tensors.BoundingBoxFormat`. The first two values are
+# the X and Y coordinates of the center of the bounding box. The next two
+# values are the width and height of the bounding box, and the last value is
+# the rotation of the bounding box, in degrees.
+
+
+orig_box = BoundingBoxes(
+    [
+        [860.0, 1100, 570, 1840, -7],
+    ],
+    format="CXCYWHR",
+    canvas_size=(orig_img.size[1], orig_img.size[0]),
+)
+
+plot([(orig_img, orig_box)], bbox_width=10)
+
+# %%
+# Transforms illustrations
+# ------------------------
+#
+# Using :class:`~torchvision.transforms.v2.RandomRotation`:
+rotater = v2.RandomRotation(degrees=(0, 180), expand=True)
+rotated_imgs = [rotater((orig_img, orig_box)) for _ in range(4)]
+plot([(orig_img, orig_box)] + rotated_imgs, bbox_width=10)
+
+# %%
+# Using :class:`~torchvision.transforms.v2.Pad`:
+padded_imgs_and_boxes = [
+    v2.Pad(padding=padding)(orig_img, orig_box)
+    for padding in (30, 50, 100, 200)
+]
+plot([(orig_img, orig_box)] + padded_imgs_and_boxes, bbox_width=10)
+
+# %%
+# Using :class:`~torchvision.transforms.v2.Resize`:
+resized_imgs = [
+    v2.Resize(size=size)(orig_img, orig_box)
+    for size in (30, 50, 100, orig_img.size)
+]
+plot([(orig_img, orig_box)] + resized_imgs, bbox_width=5)
+
+# %%
+# Note that the bounding box looking bigger in the images with fewer pixels is
+# an artifact, not reality. The box boundaries are drawn as a rasterized line
+# of fixed width, and that width is relatively large when the image is, say,
+# only 30 pixels wide.
+#
+# .. _clamping_mode_tuto:
+#
+# Clamping Mode, and its effect on transforms
+# -------------------------------------------
+#
+# Some transforms, such as :class:`~torchvision.transforms.v2.CenterCrop`, may
+# result in the transformed bounding box falling partially outside of the
+# transformed (cropped) image. In general, this may happen with most of the
+# :ref:`geometric transforms `.
+#
+# In such cases, the bounding box is clamped to the transformed image size based
+# on its ``clamping_mode`` attribute. There are three possible values for
+# ``clamping_mode``, which determine how the box is clamped after a
+# transformation:
+#
+# - ``None``: No clamping is applied, and the bounding box may be partially
+#   outside of the image.
+# - ``"hard"``: The box is clamped to the image size, such that all its corners
+#   are within the image canvas. This potentially results in a loss of
+#   information, and it can lead to unintuitive results, but it may be
+#   necessary for some applications, e.g. if the model doesn't support boxes
+#   outside of the image.
+# - ``"soft"``: An intermediate mode between ``None`` and ``"hard"``: the box
+#   is clamped, but not as strictly as in ``"hard"`` mode. Some box dimensions
+#   may still be outside of the image. This is the default when constructing
+#   :class:`~torchvision.tv_tensors.BoundingBoxes`.
+#
+# .. note::
+#
+#     For axis-aligned bounding boxes, the ``"soft"`` and ``"hard"`` modes behave
+#     the same, as the bounding box is always clamped to the image size.
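+
+# %%
+# Before illustrating these modes on the tower image, here is a quick numeric
+# check. The box values below are made up for this illustration (they are not
+# part of the original example): we apply
+# :class:`~torchvision.transforms.v2.ClampBoundingBoxes` directly to a box
+# that sticks out of a small canvas and compare the clamped coordinates.
+
+oob_box = BoundingBoxes(
+    [[180.0, 180, 120, 80, 30]],  # centered near the bottom-right corner
+    format="CXCYWHR",
+    canvas_size=(200, 200),
+    clamping_mode=None,  # keep the raw coordinates when constructing
+)
+print("raw box:", oob_box.tolist())
+for mode in ("soft", "hard"):
+    clamped = v2.ClampBoundingBoxes(clamping_mode=mode)(oob_box)
+    print(f"with {mode} clamping:", clamped.tolist())
+
+# %%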
+#
+# Let's illustrate the clamping modes with the
+# :class:`~torchvision.transforms.v2.CenterCrop` transform:
+
+assert orig_box.clamping_mode == "soft"
+
+box_hard_clamping = BoundingBoxes(orig_box, format=orig_box.format, canvas_size=orig_box.canvas_size, clamping_mode="hard")
+
+box_no_clamping = BoundingBoxes(orig_box, format=orig_box.format, canvas_size=orig_box.canvas_size, clamping_mode=None)
+
+crop_sizes = (800, 1200, 2000, orig_img.size)
+soft_center_crops_and_boxes = [
+    v2.CenterCrop(size=size)(orig_img, orig_box)
+    for size in crop_sizes
+]
+
+hard_center_crops_and_boxes = [
+    v2.CenterCrop(size=size)(orig_img, box_hard_clamping)
+    for size in crop_sizes
+]
+
+no_clamping_center_crops_and_boxes = [
+    v2.CenterCrop(size=size)(orig_img, box_no_clamping)
+    for size in crop_sizes
+]
+
+plot([[(orig_img, box_hard_clamping)] + hard_center_crops_and_boxes,
+      [(orig_img, orig_box)] + soft_center_crops_and_boxes,
+      [(orig_img, box_no_clamping)] + no_clamping_center_crops_and_boxes],
+     bbox_width=10)
+
+# %%
+# The plot above shows the ``"hard"``, ``"soft"`` and ``None`` clamping modes,
+# in that order. While ``"soft"`` and ``None`` result in similar plots, they do
+# not lead to the exact same clamped boxes. The non-clamped boxes can have
+# coordinates that extend further outside of the image:
+print("boxes with soft clamping:")
+print(soft_center_crops_and_boxes)
+print()
+print("boxes with no clamping:")
+print(no_clamping_center_crops_and_boxes)
+
+# %%
+#
+# Setting the clamping mode
+# --------------------------
+#
+# The ``clamping_mode`` attribute, which determines the clamping strategy that
+# is applied to a box, can be set in different ways:
+#
+# - When constructing the bounding box with its
+#   :class:`~torchvision.tv_tensors.BoundingBoxes` constructor, as done in the example above.
+# - By directly setting the attribute on an existing instance, e.g. ``boxes.clamping_mode = "hard"``.
+# - By calling the :class:`~torchvision.transforms.v2.SetClampingMode` transform.
+#
+# Also, remember that you can always clamp the bounding box manually by
+# calling the :class:`~torchvision.transforms.v2.ClampBoundingBoxes` transform!
+# Here's an example illustrating all of these options:
+
+t = v2.Compose([
+    v2.CenterCrop(size=(800,)),  # clamps according to the current clamping_mode
+                                 # attribute, in this case set by the constructor
+    v2.SetClampingMode(None),  # sets the clamping_mode attribute for future transforms
+    v2.Pad(padding=3),  # clamps according to the current clamping_mode
+                        # i.e. ``None``
+    v2.ClampBoundingBoxes(clamping_mode="soft"),  # clamps with "soft" mode.
+])
+
+out_img, out_box = t(orig_img, orig_box)
+plot([(orig_img, orig_box), (out_img, out_box)], bbox_width=10)
+
+# %%
diff --git a/gallery/transforms/plot_transforms_getting_started.py b/gallery/transforms/plot_transforms_getting_started.py
index 2696a9e57e7..d7fb36a4a77 100644
--- a/gallery/transforms/plot_transforms_getting_started.py
+++ b/gallery/transforms/plot_transforms_getting_started.py
@@ -79,12 +79,13 @@
 # very easy: the v2 transforms are fully compatible with the v1 API, so you
 # only need to change the import!
 #
-# Detection, Segmentation, Videos
+# Videos, boxes, masks, keypoints
 # -------------------------------
 #
-# The new Torchvision transforms in the ``torchvision.transforms.v2`` namespace
-# support tasks beyond image classification: they can also transform bounding
-# boxes, segmentation / detection masks, or videos.
+# The Torchvision transforms in the ``torchvision.transforms.v2`` namespace
+# support tasks beyond image classification: they can also transform rotated or
+# axis-aligned bounding boxes, segmentation / detection masks, videos, and
+# keypoints.
 #
 # Let's briefly look at a detection example with bounding boxes.
 
@@ -129,8 +130,9 @@
 # TVTensors are :class:`torch.Tensor` subclasses. The available TVTensors are
 # :class:`~torchvision.tv_tensors.Image`,
 # :class:`~torchvision.tv_tensors.BoundingBoxes`,
-# :class:`~torchvision.tv_tensors.Mask`, and
-# :class:`~torchvision.tv_tensors.Video`.
+# :class:`~torchvision.tv_tensors.Mask`,
+# :class:`~torchvision.tv_tensors.Video`, and
+# :class:`~torchvision.tv_tensors.KeyPoints`.
 #
 # TVTensors look and feel just like regular tensors - they **are** tensors.
 # Everything that is supported on a plain :class:`torch.Tensor` like ``.sum()``
diff --git a/torchvision/transforms/v2/_meta.py b/torchvision/transforms/v2/_meta.py
index 68395b468ba..39f223f0398 100644
--- a/torchvision/transforms/v2/_meta.py
+++ b/torchvision/transforms/v2/_meta.py
@@ -27,11 +27,10 @@ def transform(self, inpt: tv_tensors.BoundingBoxes, params: dict[str, Any]) -> t
 class ClampBoundingBoxes(Transform):
     """Clamp bounding boxes to their corresponding image dimensions.
 
-    The clamping is done according to the bounding boxes' ``canvas_size`` meta-data.
-
     Args:
-        clamping_mode: TODOBB more docs. Default is None which relies on the input box' clamping_mode attribute.
-
+        clamping_mode: Default is ``"auto"``, which relies on the input box's
+            ``clamping_mode`` attribute. See :ref:`clamping_mode_tuto` for
+            more details on how to use this transform.
     """
 
     def __init__(self, clamping_mode: Union[CLAMPING_MODE_TYPE, str] = "auto") -> None:
@@ -57,7 +56,15 @@ def transform(self, inpt: tv_tensors.KeyPoints, params: dict[str, Any]) -> tv_te
 
 class SetClampingMode(Transform):
-    """TODOBB"""
+    """Sets the ``clamping_mode`` attribute of the bounding boxes for future transforms.
+
+    Args:
+        clamping_mode: The clamping mode to set. Possible values are: ``"soft"``,
+            ``"hard"``, or ``None``. See :ref:`clamping_mode_tuto` for more
+            details on how to use this transform.
+    """
 
     def __init__(self, clamping_mode: CLAMPING_MODE_TYPE) -> None:
         super().__init__()
diff --git a/torchvision/tv_tensors/_bounding_boxes.py b/torchvision/tv_tensors/_bounding_boxes.py
index e4963192671..7aa3e50458d 100644
--- a/torchvision/tv_tensors/_bounding_boxes.py
+++ b/torchvision/tv_tensors/_bounding_boxes.py
@@ -16,17 +16,20 @@ class BoundingBoxFormat(Enum):
 
     Available formats are:
 
-    * ``XYXY``
-    * ``XYWH``
-    * ``CXCYWH``
-    * ``XYWHR``: rotated boxes represented via corner, width and height, x1, y1
-      being top left, w, h being width and height. r is rotation angle in
+    * ``XYXY``: bounding box represented via corners; x1, y1 being top left;
+      x2, y2 being bottom right.
+    * ``XYWH``: bounding box represented via corner, width and height; x1, y1
+      being top left; w, h being width and height.
+    * ``CXCYWH``: bounding box represented via center, width and height; cx,
+      cy being center of box; w, h being width and height.
+    * ``XYWHR``: rotated boxes represented via corner, width and height; x1, y1
+      being top left; w, h being width and height. r is rotation angle in
       degrees.
-    * ``CXCYWHR``: rotated boxes represented via centre, width and height, cx,
-      cy being center of box, w, h being width and height. r is rotation angle
+    * ``CXCYWHR``: rotated boxes represented via center, width and height; cx,
+      cy being center of box; w, h being width and height. r is rotation angle
       in degrees.
-    * ``XYXYXYXY``: rotated boxes represented via corners, x1, y1 being top
-      left, x2, y2 being top right, x3, y3 being bottom right, x4, y4 being
+    * ``XYXYXYXY``: rotated boxes represented via corners; x1, y1 being top
+      left; x2, y2 being top right; x3, y3 being bottom right; x4, y4 being
       bottom left.
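+
+    As a concrete (illustrative) example, the same axis-aligned box can be
+    written as ``[10, 20, 50, 60]`` in ``XYXY``, ``[10, 20, 40, 40]`` in
+    ``XYWH``, or ``[30, 40, 40, 40]`` in ``CXCYWH``: the corners are at
+    x1=10, y1=20 and x2=50, y2=60, so the width and height are w=40, h=40,
+    and the center is at cx=30, cy=40.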
     """
 
@@ -56,12 +59,17 @@ def is_rotated_bounding_format(format: BoundingBoxFormat | str) -> bool:
 # This should ideally be a Literal, but torchscript fails.
 CLAMPING_MODE_TYPE = Optional[str]
 
-# TODOBB All docs. Add any new API to rst files, add tutorial[s].
-
 
 class BoundingBoxes(TVTensor):
     """:class:`torch.Tensor` subclass for bounding boxes with shape ``[N, K]``.
 
+    .. note::
+        Support for rotated bounding boxes was released in TorchVision 0.23 and
+        is currently a BETA feature. We don't expect the API to change, but
+        there may be some rare edge-cases. If you find any issues, please report
+        them on our bug tracker:
+        https://github.com/pytorch/vision/issues?q=is:open+is:issue
+
     Where ``N`` is the number of bounding boxes and ``K`` is 4 for unrotated
     boxes, and 5 or 8 for rotated boxes.
 
@@ -75,7 +83,8 @@ class BoundingBoxes(TVTensor):
         data: Any data that can be turned into a tensor with :func:`torch.as_tensor`.
         format (BoundingBoxFormat, str): Format of the bounding box.
         canvas_size (two-tuple of ints): Height and width of the corresponding image or video.
-        clamping_mode: TODOBB
+        clamping_mode: The clamping mode to use when applying transforms that may result in bounding boxes
+            partially outside of the image. Possible values are: ``"soft"``, ``"hard"``, or ``None``.
+            Read more in :ref:`clamping_mode_tuto`.
         dtype (torch.dtype, optional): Desired data type of the bounding box. If
             omitted, will be inferred from ``data``.
         device (torch.device, optional): Desired device of the bounding box. If omitted and ``data`` is a
diff --git a/torchvision/tv_tensors/_keypoints.py b/torchvision/tv_tensors/_keypoints.py
index cb4163be20d..aede31ad7db 100644
--- a/torchvision/tv_tensors/_keypoints.py
+++ b/torchvision/tv_tensors/_keypoints.py
@@ -11,21 +11,24 @@ class KeyPoints(TVTensor):
     """:class:`torch.Tensor` subclass for tensors with shape ``[..., 2]`` that represent points in an image.
 
-    Each point is represented by its X and Y coordinates along the width and height dimensions, respectively.
+    .. note::
+        Support for keypoints was released in TorchVision 0.23 and is currently
+        a BETA feature. We don't expect the API to change, but there may be some
+        rare edge-cases. If you find any issues, please report them on our bug
+        tracker: https://github.com/pytorch/vision/issues?q=is:open+is:issue
+
+    Each point is represented by its X and Y coordinates along the width and
+    height dimensions, respectively.
 
-    KeyPoints can be converted from :class:`torchvision.tv_tensors.BoundingBoxes`
-    by :func:`torchvision.transforms.v2.functional.convert_bounding_boxes_to_points`.
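+    For instance, a minimal sketch (the coordinates below are made up for
+    illustration)::
+
+        import torch
+        from torchvision import tv_tensors
+
+        # three (x, y) points on a 100x100 canvas
+        points = tv_tensors.KeyPoints(
+            torch.tensor([[10.0, 10.0], [50.0, 30.0], [90.0, 90.0]]),
+            canvas_size=(100, 100),
+        )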
 
     KeyPoints may represent any object that can be represented by sequences of 2D points:
 
     - `Polygonal chains `_,
-      including polylines, Bézier curves, etc., which should be of shape
-      ``[N_chains, N_points, 2]``, which is equal to ``[N_chains, N_segments +
-      1, 2]``
-    - Polygons, which should be of shape ``[N_polygons, N_points, 2]``, which is
-      equal to ``[N_polygons, N_sides, 2]``
-    - Skeletons, which could be of shape ``[N_skeletons, N_bones, 2, 2]`` for
-      pose-estimation models
+      including polylines, Bézier curves, etc., which can be of shape
+      ``[N_chains, N_points, 2]``.
+    - Polygons, which can be of shape ``[N_polygons, N_points, 2]``.
+    - Skeletons, which can be of shape ``[N_skeletons, N_bones, 2, 2]`` for
+      pose-estimation models.
 
     .. note::
        Like for :class:`torchvision.tv_tensors.BoundingBoxes`, there should