4242from ..quantizers .quantization_config import QuantizationMethod
4343from ..utils import (
4444 CONFIG_NAME ,
45+ FLASHPACK_WEIGHTS_NAME ,
4546 FLAX_WEIGHTS_NAME ,
4647 HF_ENABLE_PARALLEL_LOADING ,
4748 SAFE_WEIGHTS_INDEX_NAME ,
5556 is_accelerate_available ,
5657 is_bitsandbytes_available ,
5758 is_bitsandbytes_version ,
59+ is_flashpack_available ,
5860 is_peft_available ,
5961 is_torch_version ,
6062 logging ,
@@ -673,6 +675,7 @@ def save_pretrained(
673675 variant : str | None = None ,
674676 max_shard_size : int | str = "10GB" ,
675677 push_to_hub : bool = False ,
678+ use_flashpack : bool = False ,
676679 ** kwargs ,
677680 ):
678681 """
@@ -725,7 +728,12 @@ def save_pretrained(
725728 " the logger on the traceback to understand the reason why the quantized model is not serializable."
726729 )
727730
728- weights_name = SAFETENSORS_WEIGHTS_NAME if safe_serialization else WEIGHTS_NAME
731+ weights_name = WEIGHTS_NAME
732+ if use_flashpack :
733+ weights_name = FLASHPACK_WEIGHTS_NAME
734+ elif safe_serialization :
735+ weights_name = SAFETENSORS_WEIGHTS_NAME
736+
729737 weights_name = _add_variant (weights_name , variant )
730738 weights_name_pattern = weights_name .replace (".bin" , "{suffix}.bin" ).replace (
731739 ".safetensors" , "{suffix}.safetensors"
@@ -752,58 +760,74 @@ def save_pretrained(
752760 # Save the model
753761 state_dict = model_to_save .state_dict ()
754762
755- # Save the model
756- state_dict_split = split_torch_state_dict_into_shards (
757- state_dict , max_shard_size = max_shard_size , filename_pattern = weights_name_pattern
758- )
759-
760- # Clean the folder from a previous save
761- if is_main_process :
762- for filename in os .listdir (save_directory ):
763- if filename in state_dict_split .filename_to_tensors .keys ():
764- continue
765- full_filename = os .path .join (save_directory , filename )
766- if not os .path .isfile (full_filename ):
767- continue
768- weights_without_ext = weights_name_pattern .replace (".bin" , "" ).replace (".safetensors" , "" )
769- weights_without_ext = weights_without_ext .replace ("{suffix}" , "" )
770- filename_without_ext = filename .replace (".bin" , "" ).replace (".safetensors" , "" )
771- # make sure that file to be deleted matches format of sharded file, e.g. pytorch_model-00001-of-00005
772- if (
773- filename .startswith (weights_without_ext )
774- and _REGEX_SHARD .fullmatch (filename_without_ext ) is not None
775- ):
776- os .remove (full_filename )
777-
778- for filename , tensors in state_dict_split .filename_to_tensors .items ():
779- shard = {tensor : state_dict [tensor ].contiguous () for tensor in tensors }
780- filepath = os .path .join (save_directory , filename )
781- if safe_serialization :
782- # At some point we will need to deal better with save_function (used for TPU and other distributed
783- # joyfulness), but for now this enough.
784- safetensors .torch .save_file (shard , filepath , metadata = {"format" : "pt" })
763+ if use_flashpack :
764+ if is_flashpack_available ():
765+ import flashpack
785766 else :
786- torch .save (shard , filepath )
767+ logger .error (
768+ "Saving a FlashPack checkpoint in PyTorch, requires both PyTorch and flashpack to be installed. Please see "
769+ "https://pytorch.org/ and https://github.com/fal-ai/flashpack for installation instructions."
770+ )
771+ raise ImportError ("Please install torch and flashpack to save a FlashPack checkpoint in PyTorch." )
787772
788- if state_dict_split .is_sharded :
789- index = {
790- "metadata" : state_dict_split .metadata ,
791- "weight_map" : state_dict_split .tensor_to_filename ,
792- }
793- save_index_file = SAFE_WEIGHTS_INDEX_NAME if safe_serialization else WEIGHTS_INDEX_NAME
794- save_index_file = os .path .join (save_directory , _add_variant (save_index_file , variant ))
795- # Save the index as well
796- with open (save_index_file , "w" , encoding = "utf-8" ) as f :
797- content = json .dumps (index , indent = 2 , sort_keys = True ) + "\n "
798- f .write (content )
799- logger .info (
800- f"The model is bigger than the maximum size per checkpoint ({ max_shard_size } ) and is going to be "
801- f"split in { len (state_dict_split .filename_to_tensors )} checkpoint shards. You can find where each parameters has been saved in the "
802- f"index located at { save_index_file } ."
773+ flashpack .serialization .pack_to_file (
774+ state_dict_or_model = state_dict ,
775+ destination_path = os .path .join (save_directory , weights_name ),
776+ target_dtype = self .dtype ,
803777 )
804778 else :
805- path_to_weights = os .path .join (save_directory , weights_name )
806- logger .info (f"Model weights saved in { path_to_weights } " )
779+ # Save the model
780+ state_dict_split = split_torch_state_dict_into_shards (
781+ state_dict , max_shard_size = max_shard_size , filename_pattern = weights_name_pattern
782+ )
783+
784+ # Clean the folder from a previous save
785+ if is_main_process :
786+ for filename in os .listdir (save_directory ):
787+ if filename in state_dict_split .filename_to_tensors .keys ():
788+ continue
789+ full_filename = os .path .join (save_directory , filename )
790+ if not os .path .isfile (full_filename ):
791+ continue
792+ weights_without_ext = weights_name_pattern .replace (".bin" , "" ).replace (".safetensors" , "" )
793+ weights_without_ext = weights_without_ext .replace ("{suffix}" , "" )
794+ filename_without_ext = filename .replace (".bin" , "" ).replace (".safetensors" , "" )
795+ # make sure that file to be deleted matches format of sharded file, e.g. pytorch_model-00001-of-00005
796+ if (
797+ filename .startswith (weights_without_ext )
798+ and _REGEX_SHARD .fullmatch (filename_without_ext ) is not None
799+ ):
800+ os .remove (full_filename )
801+
802+ for filename , tensors in state_dict_split .filename_to_tensors .items ():
803+ shard = {tensor : state_dict [tensor ].contiguous () for tensor in tensors }
804+ filepath = os .path .join (save_directory , filename )
805+ if safe_serialization :
806+ # At some point we will need to deal better with save_function (used for TPU and other distributed
807+ # joyfulness), but for now this enough.
808+ safetensors .torch .save_file (shard , filepath , metadata = {"format" : "pt" })
809+ else :
810+ torch .save (shard , filepath )
811+
812+ if state_dict_split .is_sharded :
813+ index = {
814+ "metadata" : state_dict_split .metadata ,
815+ "weight_map" : state_dict_split .tensor_to_filename ,
816+ }
817+ save_index_file = SAFE_WEIGHTS_INDEX_NAME if safe_serialization else WEIGHTS_INDEX_NAME
818+ save_index_file = os .path .join (save_directory , _add_variant (save_index_file , variant ))
819+ # Save the index as well
820+ with open (save_index_file , "w" , encoding = "utf-8" ) as f :
821+ content = json .dumps (index , indent = 2 , sort_keys = True ) + "\n "
822+ f .write (content )
823+ logger .info (
824+ f"The model is bigger than the maximum size per checkpoint ({ max_shard_size } ) and is going to be "
825+ f"split in { len (state_dict_split .filename_to_tensors )} checkpoint shards. You can find where each parameters has been saved in the "
826+ f"index located at { save_index_file } ."
827+ )
828+ else :
829+ path_to_weights = os .path .join (save_directory , weights_name )
830+ logger .info (f"Model weights saved in { path_to_weights } " )
807831
808832 if push_to_hub :
809833 # Create a new empty model card and eventually tag it
@@ -940,6 +964,12 @@ def from_pretrained(cls, pretrained_model_name_or_path: str | os.PathLike | None
940964 disable_mmap ('bool', *optional*, defaults to 'False'):
941965 Whether to disable mmap when loading a Safetensors model. This option can perform better when the model
942966 is on a network mount or hard drive, which may not handle the seeky-ness of mmap very well.
967+ use_flashpack (`bool`, *optional*, defaults to `False`):
968+ If set to `True`, the model is loaded from `flashpack` weights.
969+ flashpack_kwargs (`dict[str, Any]`, *optional*, defaults to `{}`):
970+ Kwargs passed to
971+ [`flashpack.deserialization.assign_from_file`](https://github.com/fal-ai/flashpack/blob/f1aa91c5cd9532a3dbf5bcc707ab9b01c274b76c/src/flashpack/deserialization.py#L408-L422)
972+
943973
944974 > [!TIP] > To use private or [gated models](https://huggingface.co/docs/hub/models-gated#gated-models), log-in
945975 with `hf auth login`. You can also activate the special >
@@ -984,6 +1014,8 @@ def from_pretrained(cls, pretrained_model_name_or_path: str | os.PathLike | None
9841014 dduf_entries : dict [str , DDUFEntry ] | None = kwargs .pop ("dduf_entries" , None )
9851015 disable_mmap = kwargs .pop ("disable_mmap" , False )
9861016 parallel_config : ParallelConfig | ContextParallelConfig | None = kwargs .pop ("parallel_config" , None )
1017+ use_flashpack = kwargs .pop ("use_flashpack" , False )
1018+ flashpack_kwargs = kwargs .pop ("flashpack_kwargs" , {})
9871019
9881020 is_parallel_loading_enabled = HF_ENABLE_PARALLEL_LOADING
9891021 if is_parallel_loading_enabled and not low_cpu_mem_usage :
@@ -1212,30 +1244,37 @@ def from_pretrained(cls, pretrained_model_name_or_path: str | os.PathLike | None
12121244 subfolder = subfolder or "" ,
12131245 dduf_entries = dduf_entries ,
12141246 )
1215- elif use_safetensors :
1216- try :
1217- resolved_model_file = _get_model_file (
1218- pretrained_model_name_or_path ,
1219- weights_name = _add_variant (SAFETENSORS_WEIGHTS_NAME , variant ),
1220- cache_dir = cache_dir ,
1221- force_download = force_download ,
1222- proxies = proxies ,
1223- local_files_only = local_files_only ,
1224- token = token ,
1225- revision = revision ,
1226- subfolder = subfolder ,
1227- user_agent = user_agent ,
1228- commit_hash = commit_hash ,
1229- dduf_entries = dduf_entries ,
1230- )
1247+ else :
1248+ if use_flashpack :
1249+ weights_name = FLASHPACK_WEIGHTS_NAME
1250+ elif use_safetensors :
1251+ weights_name = _add_variant (SAFETENSORS_WEIGHTS_NAME , variant )
1252+ else :
1253+ weights_name = None
1254+ if weights_name is not None :
1255+ try :
1256+ resolved_model_file = _get_model_file (
1257+ pretrained_model_name_or_path ,
1258+ weights_name = weights_name ,
1259+ cache_dir = cache_dir ,
1260+ force_download = force_download ,
1261+ proxies = proxies ,
1262+ local_files_only = local_files_only ,
1263+ token = token ,
1264+ revision = revision ,
1265+ subfolder = subfolder ,
1266+ user_agent = user_agent ,
1267+ commit_hash = commit_hash ,
1268+ dduf_entries = dduf_entries ,
1269+ )
12311270
1232- except IOError as e :
1233- logger .error (f"An error occurred while trying to fetch { pretrained_model_name_or_path } : { e } " )
1234- if not allow_pickle :
1235- raise
1236- logger .warning (
1237- "Defaulting to unsafe serialization. Pass `allow_pickle=False` to raise an error instead."
1238- )
1271+ except IOError as e :
1272+ logger .error (f"An error occurred while trying to fetch { pretrained_model_name_or_path } : { e } " )
1273+ if not allow_pickle :
1274+ raise
1275+ logger .warning (
1276+ "Defaulting to unsafe serialization. Pass `allow_pickle=False` to raise an error instead."
1277+ )
12391278
12401279 if resolved_model_file is None and not is_sharded :
12411280 resolved_model_file = _get_model_file (
@@ -1275,6 +1314,44 @@ def from_pretrained(cls, pretrained_model_name_or_path: str | os.PathLike | None
12751314 with ContextManagers (init_contexts ):
12761315 model = cls .from_config (config , ** unused_kwargs )
12771316
1317+ if use_flashpack :
1318+ if is_flashpack_available ():
1319+ import flashpack
1320+ else :
1321+ logger .error (
1322+ "Loading a FlashPack checkpoint in PyTorch, requires both PyTorch and flashpack to be installed. Please see "
1323+ "https://pytorch.org/ and https://github.com/fal-ai/flashpack for installation instructions."
1324+ )
1325+ raise ImportError ("Please install torch and flashpack to load a FlashPack checkpoint in PyTorch." )
1326+
1327+ if device_map is None :
1328+ logger .warning (
1329+ "`device_map` has not been provided for FlashPack, model will be on `cpu` - provide `device_map` to fully utilize "
1330+ "the benefit of FlashPack."
1331+ )
1332+ flashpack_device = torch .device ("cpu" )
1333+ else :
1334+ device = device_map ["" ]
1335+ if isinstance (device , str ) and device in ["auto" , "balanced" , "balanced_low_0" , "sequential" ]:
1336+ raise ValueError (
1337+ "FlashPack `device_map` should not be one of `auto`, `balanced`, `balanced_low_0`, `sequential`. Use a specific device instead, e.g., `device_map='cuda'` or `device_map='cuda:0'"
1338+ )
1339+ flashpack_device = torch .device (device ) if not isinstance (device , torch .device ) else device
1340+
1341+ flashpack .mixin .assign_from_file (
1342+ model = model ,
1343+ path = resolved_model_file [0 ],
1344+ device = flashpack_device ,
1345+ ** flashpack_kwargs ,
1346+ )
1347+ if dtype_orig is not None :
1348+ torch .set_default_dtype (dtype_orig )
1349+ if output_loading_info :
1350+ logger .warning ("`output_loading_info` is not supported with FlashPack." )
1351+ return model , {}
1352+
1353+ return model
1354+
12781355 if dtype_orig is not None :
12791356 torch .set_default_dtype (dtype_orig )
12801357
0 commit comments