datajuicer
diff --git a/‎data_juicer/ops/mapper/__init__.py‎
Lines changed: 4 additions & 0 deletions b/‎data_juicer/ops/mapper/__init__.py‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎data_juicer/ops/mapper/video_depth_estimation_mapper.py‎
Lines changed: 143 additions & 0 deletions b/‎data_juicer/ops/mapper/video_depth_estimation_mapper.py‎
Lines changed: 143 additions & 0 deletions
diff --git a/‎data_juicer/ops/mapper/video_object_segmenting_mapper.py‎
Lines changed: 210 additions & 0 deletions b/‎data_juicer/ops/mapper/video_object_segmenting_mapper.py‎
Lines changed: 210 additions & 0 deletions
diff --git a/‎data_juicer/utils/constant.py‎
Lines changed: 4 additions & 0 deletions b/‎data_juicer/utils/constant.py‎
Lines changed: 4 additions & 0 deletions
@@ -79,9 +79,11 @@
 from .video_captioning_from_frames_mapper import VideoCaptioningFromFramesMapper
 from .video_captioning_from_summarizer_mapper import VideoCaptioningFromSummarizerMapper
 from .video_captioning_from_video_mapper import VideoCaptioningFromVideoMapper
+from .video_depth_estimation_mapper import VideoDepthEstimationMapper
 from .video_extract_frames_mapper import VideoExtractFramesMapper
 from .video_face_blur_mapper import VideoFaceBlurMapper
 from .video_ffmpeg_wrapped_mapper import VideoFFmpegWrappedMapper
+from .video_object_segmenting_mapper import VideoObjectSegmentingMapper
 from .video_remove_watermark_mapper import VideoRemoveWatermarkMapper
 from .video_resize_aspect_ratio_mapper import VideoResizeAspectRatioMapper
 from .video_resize_resolution_mapper import VideoResizeResolutionMapper
@@ -168,9 +170,11 @@
     "VideoCaptioningFromFramesMapper",
     "VideoCaptioningFromSummarizerMapper",
     "VideoCaptioningFromVideoMapper",
+    "VideoDepthEstimationMapper",
     "VideoExtractFramesMapper",
     "VideoFFmpegWrappedMapper",
     "VideoFaceBlurMapper",
+    "VideoObjectSegmentingMapper",
     "VideoRemoveWatermarkMapper",
     "VideoResizeAspectRatioMapper",
     "VideoResizeResolutionMapper",
 
@@ -0,0 +1,143 @@
+import os
+
+import numpy as np
+
+from data_juicer.utils.cache_utils import DATA_JUICER_ASSETS_CACHE
+from data_juicer.utils.constant import Fields, MetaKeys
+from data_juicer.utils.lazy_loader import LazyLoader
+from data_juicer.utils.model_utils import get_model, prepare_model
+
+from ..base_op import OPERATORS, TAGGING_OPS, UNFORKABLE, Mapper
+from ..op_fusion import LOADED_VIDEOS
+
+# Name under which this operator is registered in the op registries
+# (passed to every ``register_module`` decorator on the class below).
+OP_NAME = "video_depth_estimation_mapper"
+
+# Heavy optional dependencies are wrapped in LazyLoader
+# (presumably deferring the real import until first use -- confirm in lazy_loader).
+cv2 = LazyLoader("cv2", "opencv-python")
+torch = LazyLoader("torch")
+# NOTE: the pip-based loader below is disabled; the operator's __init__ clones
+# the Video-Depth-Anything repository into the assets cache instead.
+# video_depth_anything = LazyLoader("video_depth_anything", "git+https://github.com/DepthAnything/Video-Depth-Anything.git", pip_args=['--no-deps'])
+open3d = LazyLoader("open3d")
+
+
+@TAGGING_OPS.register_module(OP_NAME)
+@UNFORKABLE.register_module(OP_NAME)
+@OPERATORS.register_module(OP_NAME)
+@LOADED_VIDEOS.register_module(OP_NAME)
+class VideoDepthEstimationMapper(Mapper):
+    """Perform depth estimation on the video.
+
+    For each sample, the first video listed under ``self.video_key`` is run
+    through a Video-Depth-Anything model and the result is stored in
+    ``sample[Fields.meta][MetaKeys.video_depth_tags]`` as a dict with the keys
+    ``"depth_data"`` and ``"fps"``. Samples without a video receive
+    ``{"depth_data": [], "fps": -1}``.
+    """
+
+    _accelerator = "cuda"
+
+    def __init__(
+        self,
+        video_depth_model_path: str = "video_depth_anything_vitb.pth",
+        point_cloud_dir_for_metric: str = DATA_JUICER_ASSETS_CACHE,
+        max_res: int = 1280,
+        torch_dtype: str = "fp16",
+        if_save_visualization: bool = False,
+        save_visualization_dir: str = DATA_JUICER_ASSETS_CACHE,
+        grayscale: bool = False,
+        *args,
+        **kwargs,
+    ):
+        """
+        Initialization method.
+
+        :param video_depth_model_path: The path to the Video-Depth-Anything model.
+            If the path contains the substring 'metric', the operator switches
+            to metric mode and additionally writes one point cloud per frame
+            into ``point_cloud_dir_for_metric``.
+        :param point_cloud_dir_for_metric: The path for storing point
+            clouds (for a 'metric' model).
+        :param max_res: The maximum resolution threshold for videos; videos exceeding
+            this threshold will be resized.
+        :param torch_dtype: The floating point type used for model inference. Can be
+            one of ['fp32', 'fp16']. Any value other than 'fp16' is treated
+            as fp32.
+        :param if_save_visualization: Whether to save visualization results.
+        :param save_visualization_dir: The path for saving visualization results.
+        :param grayscale: If True, the colorful palette will not be applied.
+        :raises subprocess.CalledProcessError: If cloning the
+            Video-Depth-Anything repository fails.
+
+        """
+
+        super().__init__(*args, **kwargs)
+
+        # Only needed while bootstrapping the third-party repository.
+        import subprocess
+        import sys
+
+        video_depth_anything_repo_path = os.path.join(DATA_JUICER_ASSETS_CACHE, "Video-Depth-Anything")
+        if not os.path.exists(video_depth_anything_repo_path):
+            # An argument list (no shell) keeps cache paths containing spaces or
+            # shell metacharacters intact, and check=True reports a failed clone
+            # here instead of as an obscure import error below.
+            subprocess.run(
+                [
+                    "git",
+                    "clone",
+                    "https://github.com/DepthAnything/Video-Depth-Anything.git",
+                    video_depth_anything_repo_path,
+                ],
+                check=True,
+            )
+
+        # Avoid growing sys.path every time the operator is instantiated.
+        if video_depth_anything_repo_path not in sys.path:
+            sys.path.append(video_depth_anything_repo_path)
+        from utils.dc_utils import read_video_frames, save_video
+
+        self.metric = "metric" in video_depth_model_path
+
+        self.read_video_frames = read_video_frames
+        self.save_video = save_video
+
+        self.tag_field_name = MetaKeys.video_depth_tags
+        self.max_res = max_res
+        self.torch_dtype = torch_dtype
+        self.point_cloud_dir_for_metric = point_cloud_dir_for_metric
+        self.if_save_visualization = if_save_visualization
+        self.save_visualization_dir = save_visualization_dir
+        self.grayscale = grayscale
+        self.model_key = prepare_model(model_type="video_depth_anything", model_path=video_depth_model_path)
+
+    def _save_point_clouds(self, frames, depths):
+        """Write one coloured ``.ply`` point cloud per frame (metric mode only).
+
+        :param frames: Colour frames aligned with ``depths``.
+        :param depths: Per-frame depth maps; the last two axes are read as
+            (height, width).
+        """
+        if len(depths) == 0:
+            # Nothing was decoded; depths[0] below would raise IndexError.
+            return
+
+        # The output directory may not exist yet when the user supplies a custom path.
+        os.makedirs(self.point_cloud_dir_for_metric, exist_ok=True)
+
+        width, height = depths[0].shape[-1], depths[0].shape[-2]
+        x, y = np.meshgrid(np.arange(width), np.arange(height))
+        # NOTE(review): 470.4 looks like a focal length in pixels used to
+        # back-project pixel coordinates -- confirm against the upstream repo.
+        x = (x - width / 2) / 470.4
+        y = (y - height / 2) / 470.4
+
+        for i, (color_image, depth) in enumerate(zip(frames, depths)):
+            z = np.array(depth)
+            points = np.stack((np.multiply(x, z), np.multiply(y, z), z), axis=-1).reshape(-1, 3)
+            colors = np.array(color_image).reshape(-1, 3) / 255.0
+
+            pcd = open3d.geometry.PointCloud()
+            pcd.points = open3d.utility.Vector3dVector(points)
+            pcd.colors = open3d.utility.Vector3dVector(colors)
+            # NOTE(review): file names depend only on the frame index, so point
+            # clouds of successive videos overwrite each other in this directory.
+            open3d.io.write_point_cloud(
+                os.path.join(self.point_cloud_dir_for_metric, "point" + str(i).zfill(4) + ".ply"), pcd
+            )
+
+    def process_single(self, sample=None, rank=None):
+        """Estimate depth for the first video of ``sample`` and tag the sample.
+
+        :param sample: The sample to process; it is modified in place.
+        :param rank: Forwarded to ``get_model`` when fetching the model.
+        :return: The sample with ``Fields.meta[self.tag_field_name]`` filled.
+        """
+        # check if it's generated already
+        if self.tag_field_name in sample[Fields.meta]:
+            return sample
+
+        # there is no video in this sample
+        if self.video_key not in sample or not sample[self.video_key]:
+            sample[Fields.meta][self.tag_field_name] = {"depth_data": [], "fps": -1}
+            return sample
+
+        video_path = sample[self.video_key][0]
+        video_depth_anything_model = get_model(model_key=self.model_key, rank=rank, use_cuda=self.use_cuda())
+
+        # NOTE(review): the two -1 arguments presumably mean "no frame limit"
+        # and "keep source fps" -- confirm against utils.dc_utils.
+        frames, target_fps = self.read_video_frames(video_path, -1, -1, self.max_res)
+        depths, fps = video_depth_anything_model.infer_video_depth(
+            frames,
+            target_fps,
+            input_size=518,
+            device="cuda" if self.use_cuda() else "cpu",
+            fp32=self.torch_dtype != "fp16",
+        )
+
+        if self.if_save_visualization:
+            video_stem = os.path.splitext(os.path.basename(video_path))[0]
+            os.makedirs(self.save_visualization_dir, exist_ok=True)
+            processed_video_path = os.path.join(self.save_visualization_dir, video_stem + "_src.mp4")
+            depth_vis_path = os.path.join(self.save_visualization_dir, video_stem + "_vis.mp4")
+            self.save_video(frames, processed_video_path, fps=fps)
+            self.save_video(depths, depth_vis_path, fps=fps, is_depths=True, grayscale=self.grayscale)
+
+        if self.metric:
+            self._save_point_clouds(frames, depths)
+
+        sample[Fields.meta][self.tag_field_name] = {"depth_data": depths, "fps": fps}
+
+        return sample
@@ -0,0 +1,210 @@
+import os
+import random
+from datetime import datetime
+
+import numpy as np
+
+from data_juicer.utils.cache_utils import DATA_JUICER_ASSETS_CACHE
+from data_juicer.utils.constant import Fields, MetaKeys
+from data_juicer.utils.lazy_loader import LazyLoader
+from data_juicer.utils.model_utils import check_model, get_model, prepare_model
+
+from ..base_op import OPERATORS, TAGGING_OPS, UNFORKABLE, Mapper
+from ..op_fusion import LOADED_VIDEOS
+
+# Name under which this operator is registered in the op registries
+# (passed to every ``register_module`` decorator on the class below).
+OP_NAME = "video_object_segmenting_mapper"
+
+# Heavy optional dependencies are wrapped in LazyLoader
+# (presumably deferring the real import until first use -- confirm in lazy_loader).
+cv2 = LazyLoader("cv2", "opencv-python")
+ultralytics = LazyLoader("ultralytics")
+torch = LazyLoader("torch")
+transformers = LazyLoader("transformers")
+
+
+@TAGGING_OPS.register_module(OP_NAME)
+@UNFORKABLE.register_module(OP_NAME)
+@OPERATORS.register_module(OP_NAME)
+@LOADED_VIDEOS.register_module(OP_NAME)
+class VideoObjectSegmentingMapper(Mapper):
+    """Text-guided semantic segmentation of valid objects throughout the video (YOLOE + SAM2).
+
+    YOLOE detects the objects named in ``sample["main_character_list"]`` on the
+    first frame of the first video; the resulting boxes seed SAM2, which
+    propagates the masks through the video. Results are stored in
+    ``sample[Fields.meta][MetaKeys.video_object_segment_tags]`` as a dict with
+    the keys ``"segment_data"``, ``"cls_id_dict"``, ``"object_cls_list"`` and
+    ``"yoloe_conf_list"``.
+    """
+
+    _accelerator = "cuda"
+
+    def __init__(
+        self,
+        sam2_hf_model: str = "facebook/sam2.1-hiera-tiny",
+        yoloe_path: str = "yoloe-11l-seg.pt",
+        yoloe_conf: float = 0.5,
+        torch_dtype: str = "bf16",
+        if_binarize: bool = True,
+        if_save_visualization: bool = False,
+        save_visualization_dir: str = DATA_JUICER_ASSETS_CACHE,
+        *args,
+        **kwargs,
+    ):
+        """
+        Initialization method.
+
+        :param sam2_hf_model: Hugging Face model id of SAM2.
+        :param yoloe_path: The path to the YOLOE model.
+        :param yoloe_conf: Confidence threshold for YOLOE object detection.
+        :param torch_dtype: The floating point type used for model inference. Can
+            be one of ['fp32', 'fp16', 'bf16'].
+        :param if_binarize: Whether the final mask requires binarization.
+            If 'if_save_visualization' is set to True, 'if_binarize' will
+            automatically be adjusted to True.
+        :param if_save_visualization: Whether to save visualization results.
+        :param save_visualization_dir: The path for saving visualization results.
+        :raises KeyError: If ``torch_dtype`` is not one of the supported names.
+
+        """
+
+        super().__init__(*args, **kwargs)
+
+        # Requires the weights for YOLOE and mobileclip_blt.
+        self.yoloe_model = ultralytics.YOLO(check_model(yoloe_path))
+        torch_dtype_dict = {"fp32": torch.float32, "fp16": torch.float16, "bf16": torch.bfloat16}
+        self.torch_dtype = torch_dtype_dict[torch_dtype]
+        self.sam2_model_key = prepare_model(
+            model_type="huggingface", torch_dtype=self.torch_dtype, pretrained_model_name_or_path=sam2_hf_model
+        )
+
+        self.tag_field_name = MetaKeys.video_object_segment_tags
+        self.yoloe_conf = yoloe_conf
+        self.if_save_visualization = if_save_visualization
+        self.save_visualization_dir = save_visualization_dir
+        # Visualization indexes images with the masks, so they must be boolean.
+        self.if_binarize = True if if_save_visualization else if_binarize
+
+    @staticmethod
+    def _empty_tags():
+        """Return the tag payload used when nothing could be segmented."""
+        return {
+            "segment_data": [],
+            "cls_id_dict": [],
+            "object_cls_list": [],
+            "yoloe_conf_list": [],
+        }
+
+    def _save_mask_visualizations(self, video_segments, frame_shape, video_name, unique_suffix):
+        """Write one mask image per (frame, object) pair into the visualization dir.
+
+        :param video_segments: Per-frame lists of per-object masks (nested lists).
+        :param frame_shape: Shape of the first decoded frame; its first two
+            entries give the output image height and width.
+        :param video_name: Video file name without extension, used as file prefix.
+        :param unique_suffix: Per-call suffix that keeps file names unique.
+        """
+        os.makedirs(self.save_visualization_dir, exist_ok=True)
+
+        for frame_idx, frame_masks in enumerate(video_segments):
+            for obj_idx, obj_mask in enumerate(frame_masks):
+                canvas = np.zeros((frame_shape[0], frame_shape[1], 3), np.uint8)
+                # Cast to bool so the array is used as a mask; an integer array
+                # would be interpreted as row indices instead.
+                bool_mask = np.squeeze(np.array(obj_mask)).astype(bool)
+                canvas[bool_mask] = [225, 225, 225]
+
+                mask_path = os.path.join(
+                    self.save_visualization_dir,
+                    f"{video_name}_mask_{obj_idx}_{frame_idx}_{unique_suffix}.jpg",
+                )
+                cv2.imwrite(mask_path, canvas)
+
+    def process_single(self, sample=None, rank=None):
+        """Segment the requested objects in the first video of ``sample``.
+
+        :param sample: The sample to process; it is modified in place. It must
+            provide ``"main_character_list"`` (the class names given to YOLOE)
+            whenever it contains a readable video.
+        :param rank: Forwarded to ``get_model`` when fetching the SAM2 model.
+        :return: The sample with ``Fields.meta[self.tag_field_name]`` filled.
+        :raises KeyError: If the sample has a readable video but no
+            ``"main_character_list"`` entry.
+        """
+        # check if it's generated already
+        if self.tag_field_name in sample[Fields.meta]:
+            return sample
+
+        # there is no video in this sample
+        if self.video_key not in sample or not sample[self.video_key]:
+            sample[Fields.meta][self.tag_field_name] = self._empty_tags()
+            return sample
+
+        video_path = sample[self.video_key][0]
+        sam2_model, sam2_processor = get_model(model_key=self.sam2_model_key, rank=rank, use_cuda=self.use_cuda())
+
+        # Grab the first frame; always release the capture handle afterwards.
+        video_capture = cv2.VideoCapture(video_path)
+        try:
+            success, initial_frame = video_capture.read()
+        finally:
+            video_capture.release()
+
+        if not success:
+            # Failed to load initial frame
+            sample[Fields.meta][self.tag_field_name] = self._empty_tags()
+            return sample
+
+        # Timestamp without spaces/colons so the file names are valid on every OS.
+        now_time_str = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
+        unique_suffix = f"{now_time_str}_{random.randint(10000, 99999)}"
+        temp_video_name = os.path.splitext(os.path.basename(video_path))[0]
+
+        # Perform semantic segmentation on the first frame using YOLOE
+        os.makedirs(DATA_JUICER_ASSETS_CACHE, exist_ok=True)
+        temp_initial_frame_path = os.path.join(
+            DATA_JUICER_ASSETS_CACHE,
+            f"{temp_video_name}_initial_frame_{unique_suffix}.jpg",
+        )
+        cv2.imwrite(temp_initial_frame_path, initial_frame)
+        try:
+            class_names = sample["main_character_list"]
+            self.yoloe_model.set_classes(class_names, self.yoloe_model.get_text_pe(class_names))
+            results = self.yoloe_model.predict(temp_initial_frame_path, verbose=False, conf=self.yoloe_conf)
+        finally:
+            # Remove the temporary frame even when detection fails.
+            if os.path.exists(temp_initial_frame_path):
+                os.remove(temp_initial_frame_path)
+
+        detections = results[0].boxes
+        cls_id_dict = results[0].names
+        yoloe_conf_list = detections.conf.tolist()
+        object_cls_list = [int(cls_id) for cls_id in detections.cls.tolist()]
+        input_boxes = [[int(coord) for coord in box] for box in detections.xyxy.tolist()]
+        obj_ids = list(range(len(input_boxes)))
+
+        if not obj_ids:
+            sample[Fields.meta][self.tag_field_name] = self._empty_tags()
+            return sample
+
+        # Track objects with SAM2
+        video_frames, _ = transformers.video_utils.load_video(video_path)
+
+        inference_session = sam2_processor.init_video_session(
+            video=video_frames,
+            inference_device="cuda" if self.use_cuda() else "cpu",
+            dtype=self.torch_dtype,
+        )
+
+        ann_frame_idx = 0
+        sam2_processor.add_inputs_to_inference_session(
+            inference_session=inference_session,
+            frame_idx=ann_frame_idx,
+            obj_ids=obj_ids,
+            # One extra nesting level: a list of boxes for a single image.
+            input_boxes=[input_boxes],
+        )
+
+        # Run the model on the annotated frame before propagating. The returned
+        # masks are not needed here (presumably the call also records the
+        # first-frame prediction in the session -- confirm with the SAM2 docs).
+        sam2_model(
+            inference_session=inference_session,
+            frame_idx=ann_frame_idx,
+        )
+
+        # Propagate all objects through the video
+        original_sizes = [[inference_session.video_height, inference_session.video_width]]
+        video_segments = []
+        for sam2_video_output in sam2_model.propagate_in_video_iterator(inference_session):
+            video_res_masks = sam2_processor.post_process_masks(
+                [sam2_video_output.pred_masks],
+                original_sizes=original_sizes,
+                binarize=self.if_binarize,
+            )[0]
+            video_segments.append([video_res_masks[i].tolist() for i in range(len(inference_session.obj_ids))])
+
+        sample[Fields.meta][self.tag_field_name] = {
+            "segment_data": video_segments,
+            "cls_id_dict": list(cls_id_dict.values()),
+            "object_cls_list": object_cls_list,
+            "yoloe_conf_list": yoloe_conf_list,
+        }
+
+        if self.if_save_visualization:
+            self._save_mask_visualizations(video_segments, initial_frame.shape, temp_video_name, unique_suffix)
+
+        return sample
@@ -60,6 +60,10 @@ class MetaKeys(object):
     video_audio_tags = "video_audio_tags"
     # # video frames
     video_frames = "video_frames"
+    # # object segment info in video
+    video_object_segment_tags = "video_object_segment_tags"
+    # # depth info in video
+    video_depth_tags = "video_depth_tags"
     # # image tags
     image_tags = "image_tags"
     # # bounding box tag