diff --git a/media/coverage_badge.svg b/media/coverage_badge.svg index fd6a4f9..0ec92b7 100644 --- a/media/coverage_badge.svg +++ b/media/coverage_badge.svg @@ -15,7 +15,7 @@ coverage coverage - 22% - 22% + 21% + 21% diff --git a/tests/conftest.py b/tests/conftest.py index 1116f38..e669566 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -20,7 +20,19 @@ def pytest_addoption(parser): ) parser.addoption( "--yolo-version", - choices=["v5", "v6", "v6r2", "v6r4", "v7", "v8", "v9", "v10", "v11", "v12"], + choices=[ + "v5", + "v6", + "v6r2", + "v6r4", + "v7", + "v8", + "v9", + "v10", + "v11", + "v12", + "v26", + ], default=None, help="If set then test only that specific yolo version", ) diff --git a/tests/constants.py b/tests/constants.py index b39825b..82f0ebb 100644 --- a/tests/constants.py +++ b/tests/constants.py @@ -69,6 +69,13 @@ {"name": "yolov12m", "version": "v12"}, {"name": "yolov12l", "version": "v12"}, {"name": "yolov12x", "version": "v12"}, + {"name": "yolo26n", "version": "v26"}, + {"name": "yolo26s", "version": "v26"}, + {"name": "yolo26m", "version": "v26"}, + {"name": "yolo26l", "version": "v26"}, + {"name": "yolo26x", "version": "v26"}, + {"name": "yolo26n-seg", "version": "v26"}, + {"name": "yolo26n-pose", "version": "v26"}, {"name": "yolov8n-cls", "version": "v8"}, {"name": "yolov8n-seg", "version": "v8"}, {"name": "yolov8n-pose", "version": "v8"}, @@ -157,6 +164,13 @@ "yolov12m": "https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo12m.pt", "yolov12l": "https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo12l.pt", "yolov12x": "https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo12x.pt", + "yolo26n": "https://github.com/ultralytics/assets/releases/download/v8.4.0/yolo26n.pt", + "yolo26s": "https://github.com/ultralytics/assets/releases/download/v8.4.0/yolo26s.pt", + "yolo26m": "https://github.com/ultralytics/assets/releases/download/v8.4.0/yolo26m.pt", + "yolo26l": "https://github.com/ultralytics/assets/releases/download/v8.4.0/yolo26l.pt", + "yolo26x": "https://github.com/ultralytics/assets/releases/download/v8.4.0/yolo26x.pt", + "yolo26n-seg": "https://github.com/ultralytics/assets/releases/download/v8.4.0/yolo26n-seg.pt", + "yolo26n-pose": "https://github.com/ultralytics/assets/releases/download/v8.4.0/yolo26n-pose.pt", "yolov11n-cls": "https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo11n-cls.pt", "yolov11n-seg": "https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo11n-seg.pt", "yolov11n-pose": "https://github.com/ultralytics/assets/releases/download/v8.3.0/yolo11n-pose.pt", diff --git a/tools/main.py b/tools/main.py index f10c6e6..a8f9717 100644 --- a/tools/main.py +++ b/tools/main.py @@ -27,6 +27,7 @@ YOLOV10_CONVERSION, YOLOV11_CONVERSION, YOLOV12_CONVERSION, + YOLOV26_CONVERSION, detect_version, ) @@ -48,6 +49,7 @@ YOLOV10_CONVERSION, YOLOV11_CONVERSION, YOLOV12_CONVERSION, + YOLOV26_CONVERSION, ] @@ -178,6 +180,10 @@ def convert( from tools.yolo.yolov8_exporter import YoloV8Exporter exporter = YoloV8Exporter(str(model_path), config.imgsz, config.use_rvc2) + elif version == YOLOV26_CONVERSION: + from tools.yolo.yolo26_exporter import Yolo26Exporter + + exporter = Yolo26Exporter(str(model_path), config.imgsz, config.use_rvc2) elif version == YOLOV10_CONVERSION: from tools.yolo.yolov10_exporter import YoloV10Exporter diff --git a/tools/modules/__init__.py b/tools/modules/__init__.py index dc2c8e1..d0de145 100644 --- a/tools/modules/__init__.py +++ b/tools/modules/__init__.py @@ -13,8 
+13,11 @@ DetectV7, DetectV8, DetectV10, + DetectV26, PoseV8, + PoseV26, SegmentV8, + SegmentV26, ) from .stage2 import Multiplier @@ -27,11 +30,14 @@ "DetectV8", "Exporter", "PoseV8", + "PoseV26", "OBBV8", "SegmentV8", + "SegmentV26", "ClassifyV8", "Multiplier", "DetectV5", "DetectV7", "DetectV10", + "DetectV26", ] diff --git a/tools/modules/heads.py b/tools/modules/heads.py index 6263ee0..fb5e058 100644 --- a/tools/modules/heads.py +++ b/tools/modules/heads.py @@ -526,3 +526,332 @@ def __init__(self, old_detect, use_rvc2): super().__init__(old_detect, use_rvc2) self.cv2 = old_detect.one2one_cv2 self.cv3 = old_detect.one2one_cv3 + + +class DetectV26(nn.Module): + """YOLOv26 Detect head for end-to-end NMS-free detection models. + + Uses one2one_cv2 and one2one_cv3 weights instead of cv2 and cv3 to enable NMS-free + inference. The one2one heads are trained with tal_topk=1 for one-to-one label + assignment. + """ + + dynamic = False # force grid reconstruction + export = False # export mode + shape = None + anchors = torch.empty(0) # init + strides = torch.empty(0) # init + max_det = 300 # max detections per image + + def __init__( + self, + old_detect, + use_rvc2: bool, + conf_threshold: float = 0.0, + ): + super().__init__() + self.nc = old_detect.nc # number of classes + self.nl = old_detect.nl # number of detection layers + self.reg_max = old_detect.reg_max # DFL channels + self.no = old_detect.no # number of outputs per anchor + self.stride = old_detect.stride # strides computed during build + + # Use one2one heads for NMS-free inference + self.cv2 = old_detect.one2one_cv2 + self.cv3 = old_detect.one2one_cv3 + self.f = old_detect.f + self.i = old_detect.i + + self.use_rvc2 = use_rvc2 + self.conf_threshold = conf_threshold + + def forward(self, x): + bs = x[0].shape[0] # batch size + + boxes = [] + scores = [] + for i in range(self.nl): + box = self.cv2[i](x[i]) + + cls_regress = self.cv3[i](x[i]) + boxes.append(box.view(bs, 4, -1)) + scores.append(cls_regress.view(bs, self.nc, -1)) + + preds = { + "boxes": torch.cat(boxes, dim=2), + "scores": torch.cat(scores, dim=2), + "feats": x, + } + + dbox = self._get_decode_boxes(preds) + y = torch.cat((dbox, preds["scores"].sigmoid()), 1) # (bs, 4+nc, num_anchors) + y = y.permute(0, 2, 1) # (bs, num_anchors, 4+nc) + return y + + def _get_decode_boxes(self, preds): + # Emulate ultralytics.nn.modules.head.Detect._get_decode_boxes for end2end export. + # preds["boxes"]: (N, 4, A), preds["feats"]: list of feature maps (N, C, H_i, W_i) + shape = preds["feats"][0].shape # BCHW + if self.dynamic or self.shape != shape: + anchor_points, stride_tensor = self._make_anchors( + preds["feats"], self.stride, 0.5 + ) + self.anchors = anchor_points.transpose(0, 1) + self.strides = stride_tensor.transpose(0, 1) + self.shape = shape + + # anchors: (1, 2, A), strides: (1, 1, A) + # returns: decoded boxes (N, 4, A) in xyxy pixels + dbox = self.dist2bbox( + preds["boxes"], self.anchors.unsqueeze(0), xywh=False, dim=1 + ) + return dbox * self.strides + + @staticmethod + def _make_anchors(feats, strides, grid_cell_offset=0.5): + # Emulate ultralytics.utils.tal.make_anchors. 
+ # feats: list of (N, C, H_i, W_i) -> returns anchor_points (A, 2), stride_tensor (A, 1) + anchor_points, stride_tensor = [], [] + dtype, device = feats[0].dtype, feats[0].device + for i, stride in enumerate(strides): + h, w = feats[i].shape[2:] + sx = torch.arange(end=w, device=device, dtype=dtype) + grid_cell_offset + sy = torch.arange(end=h, device=device, dtype=dtype) + grid_cell_offset + sy, sx = torch.meshgrid(sy, sx, indexing="ij") + anchor_points.append(torch.stack((sx, sy), -1).view(-1, 2)) + stride_tensor.append( + torch.full((h * w, 1), stride, dtype=dtype, device=device) + ) + return torch.cat(anchor_points), torch.cat(stride_tensor) + + @staticmethod + def dist2bbox(distance, anchor_points, xywh=True, dim=-1): + # Emulate ultralytics.utils.tal.dist2bbox. + # distance: (N, 4, A) if dim=1, anchor_points: (1, 2, A) -> returns (N, 4, A) + # xywh=True outputs (cx, cy, w, h); xywh=False outputs (x1, y1, x2, y2) + lt, rb = distance.chunk(2, dim) + x1y1 = anchor_points - lt + x2y2 = anchor_points + rb + if xywh: + c_xy = (x1y1 + x2y2) / 2 + wh = x2y2 - x1y1 + return torch.cat([c_xy, wh], dim) + return torch.cat((x1y1, x2y2), dim) + + +class SegmentV26(DetectV26): + """YOLOv26 Segment head for end-to-end NMS-free instance segmentation models. + + Outputs decoded boxes, class scores, mask coefficients (separate), and prototype masks. + + Output format: + - detections: (batch, num_anchors, 4 + nc) + - 4: decoded bbox coordinates (x1, y1, x2, y2) in pixel space + - nc: class scores (sigmoided) + - mask_coeffs: (batch, num_anchors, nm) + - nm: mask coefficients (raw, to be used with protos) + - protos: (batch, nm, proto_h, proto_w) + - Prototype masks + + To get final instance masks (K = number of kept detections): + mask = sigmoid(mask_coefficients @ protos.flatten(2)) # (K, H*W) + mask = mask.view(K, proto_h, proto_w) # (K, proto_h, proto_w) + mask = crop_to_bbox(mask, bbox) # crop to detection bbox + """ + + def __init__( + self, + old_segment, + use_rvc2: bool, + conf_threshold: float = 0.0, + ): + super().__init__(old_segment, use_rvc2, conf_threshold) + self.nm = old_segment.nm # number of masks (default 32) + self.npr = old_segment.npr # number of protos (default 256) + self.proto = old_segment.proto # Proto26 module + + # Use one2one mask coefficient heads for NMS-free inference + self.cv4 = old_segment.one2one_cv4 + + def forward(self, x): + """Forward pass returning detections, mask coefficients, and prototype masks. 
+ + Args: + x: List of feature maps from backbone [P3, P4, P5] + + Returns: + Tuple of: + - detections: (batch, num_anchors, 4 + nc) + - mask_coeffs: (batch, num_anchors, nm) + - protos: (batch, nm, proto_h, proto_w) + """ + bs = x[0].shape[0] # batch size + + boxes = [] + scores = [] + mask_coeffs = [] + for i in range(self.nl): + # Box regression + box = self.cv2[i](x[i]) + boxes.append(box.view(bs, 4, -1)) + + # Class scores + cls_regress = self.cv3[i](x[i]) + scores.append(cls_regress.view(bs, self.nc, -1)) + + # Mask coefficients + mask = self.cv4[i](x[i]) + mask_coeffs.append(mask.view(bs, self.nm, -1)) + + preds = { + "boxes": torch.cat(boxes, dim=2), + "scores": torch.cat(scores, dim=2), + "feats": x, + } + + # Decode boxes to pixel coordinates + dbox = self._get_decode_boxes(preds) + + # Detection output: boxes (4) + class scores (nc) + y = torch.cat((dbox, preds["scores"].sigmoid()), 1) # (bs, 4+nc, num_anchors) + y = y.permute(0, 2, 1) # (bs, num_anchors, 4+nc) + + # Mask coefficients output (separate) + mask_coeffs_cat = torch.cat(mask_coeffs, dim=2) # (bs, nm, num_anchors) + mask_coeffs_cat = mask_coeffs_cat.permute(0, 2, 1) # (bs, num_anchors, nm) + + # Get prototype masks + proto = self._get_proto(x) + + return y, mask_coeffs_cat, proto + + def _get_proto(self, x): + """Get prototype masks from Proto26 module. + + Emulate ultralytics.nn.modules.head.Segment26.forward for proto generation. + + Proto26 takes all feature maps and returns prototype masks. + """ + return self.proto(x, return_semseg=False) + + +class PoseV26(DetectV26): + """YOLOv26 Pose head for end-to-end NMS-free pose estimation models. + + Outputs decoded boxes, class scores, and decoded keypoints (separate). + + Output format: + - detections: (batch, num_anchors, 4 + nc) + - 4: decoded bbox coordinates (x1, y1, x2, y2) in pixel space + - nc: class scores (sigmoided) + - keypoints: (batch, num_anchors, nk) + - nk: keypoint values (x, y, [visibility]) for each keypoint + - x, y are in pixel coordinates + - visibility is sigmoided (if ndim == 3) + """ + + def __init__( + self, + old_pose, + use_rvc2: bool, + conf_threshold: float = 0.0, + ): + super().__init__(old_pose, use_rvc2, conf_threshold) + self.kpt_shape = old_pose.kpt_shape # (num_keypoints, ndim) e.g., (17, 3) + self.nk = old_pose.nk # total keypoint values = num_keypoints * ndim + + # Pose26: cv4 is feature extractor, cv4_kpts produces keypoints + self.cv4 = old_pose.one2one_cv4 + self.cv4_kpts = old_pose.one2one_cv4_kpts + + def forward(self, x): + """Forward pass returning detections and decoded keypoints. + + Emulate ultralytics.nn.modules.head.Pose26.forward_head. 
+ + Args: + x: List of feature maps from backbone [P3, P4, P5] + + Returns: + Tuple of: + - detections: (batch, num_anchors, 4 + nc) + - keypoints: (batch, num_anchors, nk) + """ + bs = x[0].shape[0] # batch size + + boxes = [] + scores = [] + kpts_raw = [] + for i in range(self.nl): + # Box regression + box = self.cv2[i](x[i]) + boxes.append(box.view(bs, 4, -1)) + + # Class scores + cls_regress = self.cv3[i](x[i]) + scores.append(cls_regress.view(bs, self.nc, -1)) + + # Keypoints: cv4 extracts features, cv4_kpts predicts keypoints + feat = self.cv4[i](x[i]) + kpt = self.cv4_kpts[i](feat) + kpts_raw.append(kpt.view(bs, self.nk, -1)) + + preds = { + "boxes": torch.cat(boxes, dim=2), + "scores": torch.cat(scores, dim=2), + "feats": x, + } + + # Decode boxes to pixel coordinates (this also sets self.anchors and self.strides) + # from the parent DetectV26 + dbox = self._get_decode_boxes(preds) + + # Detection output: boxes (4) + class scores (nc) + y = torch.cat((dbox, preds["scores"].sigmoid()), 1) # (bs, 4+nc, num_anchors) + y = y.permute(0, 2, 1) # (bs, num_anchors, 4+nc) + + # Decode and concatenate keypoints + # Note: After _get_decode_boxes, self.anchors is (2, A) and self.strides is (1, A) + kpts_cat = torch.cat(kpts_raw, dim=2) # (bs, nk, num_anchors) + kpts_decoded = self._kpts_decode(bs, kpts_cat) # (bs, nk, num_anchors) + kpts_decoded = kpts_decoded.permute(0, 2, 1) # (bs, num_anchors, nk) + + return y, kpts_decoded + + def _kpts_decode(self, bs, kpts): + """Decode keypoints from raw predictions to pixel coordinates. + + Emulate ultralytics.nn.modules.head.Pose26.kpts_decode. + + Args: + bs: Batch size + kpts: Raw keypoint predictions (bs, nk, num_anchors) + + Returns: + Decoded keypoints (bs, nk, num_anchors) with x, y in pixel coords + """ + ndim = self.kpt_shape[1] + num_kpts = self.kpt_shape[0] + num_anchors = kpts.shape[2] + + # Reshape to (bs, num_keypoints, ndim, num_anchors) + y = kpts.view(bs, num_kpts, ndim, num_anchors) + + # After _get_decode_boxes, anchors and strides are already in the right format: + # self.anchors: (2, num_anchors), self.strides: (1, num_anchors) + # Reshape for broadcasting with y[:, :, :2, :] which is (bs, num_kpts, 2, num_anchors) + anchors_reshaped = self.anchors.view(1, 1, 2, num_anchors) # (1, 1, 2, A) + strides_reshaped = self.strides.view(1, 1, 1, num_anchors) # (1, 1, 1, A) + + # Decode xy: (raw + anchor) * stride + xy = (y[:, :, :2, :] + anchors_reshaped) * strides_reshaped + + if ndim == 3: + # Visibility score (sigmoid) + vis = y[:, :, 2:3, :].sigmoid() + decoded = torch.cat((xy, vis), dim=2) + else: + decoded = xy + + # Reshape back to (bs, nk, num_anchors) + return decoded.view(bs, self.nk, num_anchors) diff --git a/tools/version_detection/__init__.py b/tools/version_detection/__init__.py index 11e3815..0d097e4 100644 --- a/tools/version_detection/__init__.py +++ b/tools/version_detection/__init__.py @@ -12,6 +12,7 @@ YOLOV10_CONVERSION, YOLOV11_CONVERSION, YOLOV12_CONVERSION, + YOLOV26_CONVERSION, detect_version, ) @@ -28,6 +29,7 @@ "YOLOV10_CONVERSION", "YOLOV11_CONVERSION", "YOLOV12_CONVERSION", + "YOLOV26_CONVERSION", "GOLD_YOLO_CONVERSION", "UNRECOGNIZED", ] diff --git a/tools/version_detection/version_detection.py b/tools/version_detection/version_detection.py index cc93a44..5ad5d82 100644 --- a/tools/version_detection/version_detection.py +++ b/tools/version_detection/version_detection.py @@ -17,6 +17,7 @@ YOLOV10_CONVERSION = "yolov10" YOLOV11_CONVERSION = "yolov11" YOLOV12_CONVERSION = "yolov12" +YOLOV26_CONVERSION = "yolov26" 
GOLD_YOLO_CONVERSION = "goldyolo" UNRECOGNIZED = "none" @@ -74,7 +75,9 @@ def detect_version(path: str, debug: bool = False) -> str: if debug: print(data.decode(errors="replace")) content = data.decode("latin1") - if "yolov12" in content: + if "yolo26" in content: + return YOLOV26_CONVERSION + elif "yolov12" in content: return YOLOV12_CONVERSION elif "yolo11" in content: return YOLOV11_CONVERSION diff --git a/tools/yolo/ultralytics b/tools/yolo/ultralytics index 2107aa1..0537be1 160000 --- a/tools/yolo/ultralytics +++ b/tools/yolo/ultralytics @@ -1 +1 @@ -Subproject commit 2107aa1eb9e95d93645a9febbda4a3fc75a57f21 +Subproject commit 0537be116924fef9ec3a66e4689134a6a59e7dce diff --git a/tools/yolo/yolo26_exporter.py b/tools/yolo/yolo26_exporter.py new file mode 100644 index 0000000..0284697 --- /dev/null +++ b/tools/yolo/yolo26_exporter.py @@ -0,0 +1,148 @@ +from __future__ import annotations + +import os +import sys +from typing import List, Optional, Tuple + +from loguru import logger + +from tools.modules import DetectV26, Exporter, PoseV26, SegmentV26 +from tools.utils import get_first_conv2d_in_channels +from tools.utils.constants import Encoding + +current_dir = os.path.dirname(os.path.abspath(__file__)) +yolo_path = os.path.join(current_dir, "ultralytics") +sys.path.append(yolo_path) + +from ultralytics.nn.modules import ( # noqa: E402 + Detect, + Pose26, + Segment26, +) +from ultralytics.nn.tasks import load_checkpoint # noqa: E402 + +DETECT_MODE = 0 +SEGMENT_MODE = 1 +OBB_MODE = 2 +CLASSIFY_MODE = 3 +POSE_MODE = 4 + + +def get_output_names(mode: int): + if mode == DETECT_MODE: + return ["output"] + elif mode == SEGMENT_MODE: + return ["output", "mask_output", "protos_output"] + elif mode == POSE_MODE: + return ["output", "kpt_output"] + else: + logger.warning("Unsupported task type for YOLO26, conversion may fail") + return ["output"] + + +def get_yolo_output_names(mode: int = 0): + # For now, yolo output names doesn't differ based on mode because we no longer extract 3 outputs from FPN + return ["output"] + + +class Yolo26Exporter(Exporter): + def __init__(self, model_path: str, imgsz: Tuple[int, int], use_rvc2: bool): + super().__init__( + model_path, + imgsz, + use_rvc2, + subtype="yolo26", + output_names=["output"], + ) + self.load_model() + + def load_model(self): + model, _ = load_checkpoint( + self.model_path, device="cpu", inplace=True, fuse=True + ) + + head = model.model[-1] + self.mode = -1 + if isinstance(head, (Segment26)): + model.model[-1] = SegmentV26(model.model[-1], self.use_rvc2) + self.mode = SEGMENT_MODE + elif isinstance(head, Pose26): + model.model[-1] = PoseV26(model.model[-1], self.use_rvc2) + self.mode = POSE_MODE + elif isinstance(head, Detect): + model.model[-1] = DetectV26(head, self.use_rvc2) + self.mode = DETECT_MODE + + if self.mode in [DETECT_MODE, SEGMENT_MODE, POSE_MODE]: + self.names = ( + model.module.names if hasattr(model, "module") else model.names + ) # get class names + # check num classes and labels + assert model.model[-1].nc == len(self.names), ( + f"Model class count {model.nc} != len(names) {len(self.names)}" + ) + + try: + self.number_of_channels = get_first_conv2d_in_channels(model) + except Exception as e: + logger.error(f"Error while getting number of channels: {e}") + + self.all_output_names = get_output_names(self.mode) + self.output_names = get_yolo_output_names(self.mode) + + gs = max(int(model.stride.max()), 32) # model stride + if isinstance(self.imgsz, int): + self.imgsz = [self.imgsz, self.imgsz] + for sz in self.imgsz: + if 
sz % gs != 0: + raise ValueError(f"Image size is not a multiple of maximum stride {gs}") + + # ensure correct length + if len(self.imgsz) != 2: + raise ValueError("Image size must be of length 1 or 2.") + + model.eval() + self.model = model + + def export_nn_archive( + self, class_names: Optional[List[str]] = None, encoding: Encoding = Encoding.RGB + ): + names = list(self.model.names.values()) + + if class_names is not None: + assert len(class_names) == len(names), ( + f"Number of the given class names {len(class_names)} does not match number of classes {len(names)} provided in the model!" + ) + names = class_names + + self.f_nn_archive = (self.output_folder / f"{self.model_name}.tar.xz").resolve() + + if self.mode == DETECT_MODE: + self.make_nn_archive( + class_list=names, + n_classes=self.model.model[-1].nc, + parser="YOLOExtendedParser", + encoding=encoding, + ) + elif self.mode == SEGMENT_MODE: + self.make_nn_archive( + class_list=names, + n_classes=self.model.model[-1].nc, + parser="YOLOExtendedParser", + n_prototypes=self.model.model[-1].nm, + is_softmax=False, # E2E outputs are already sigmoided + output_kwargs={ + "mask_outputs": ["mask_output"], + "protos_outputs": "protos_output", + }, + encoding=encoding, + ) + elif self.mode == POSE_MODE: + self.make_nn_archive( + class_list=names, + n_classes=self.model.model[-1].nc, + parser="YOLOExtendedParser", + n_keypoints=self.model.model[-1].kpt_shape[0], + output_kwargs={"keypoints_outputs": ["kpt_output"]}, + encoding=encoding, + )
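
Post-processing sketch for the new end-to-end outputs: as documented in the DetectV26 head above, converted YOLO26 detection models emit a single tensor of shape (batch, num_anchors, 4 + nc) containing decoded (x1, y1, x2, y2) pixel boxes followed by sigmoided class scores, so no NMS or DFL step is needed downstream. A minimal consumer sketch follows; the `output` array name and the 0.25 threshold are illustrative assumptions, not part of this diff.

# Hypothetical consumer of the converted YOLO26 detection output.
# Assumes `output` is a NumPy array of shape (1, num_anchors, 4 + nc),
# i.e. the single "output" tensor produced by DetectV26.
import numpy as np


def decode_detections(output: np.ndarray, conf_threshold: float = 0.25):
    preds = output[0]                     # (num_anchors, 4 + nc)
    boxes = preds[:, :4]                  # decoded (x1, y1, x2, y2) in pixels
    scores = preds[:, 4:]                 # class scores, already sigmoided
    class_ids = scores.argmax(axis=1)
    confidences = scores.max(axis=1)
    keep = confidences >= conf_threshold  # one2one head -> no NMS required
    return boxes[keep], class_ids[keep], confidences[keep]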