     SAGE_ATTN_AVAILABLE,
     SPARGE_ATTN_AVAILABLE,
 )
+from diffsynth_engine.utils.platform import DTYPE_FP8
 
 FA3_MAX_HEADDIM = 256
 
@@ -125,12 +126,13 @@ def attention(
         None,
         "auto",
         "eager",
-        "flash_attn_2",
-        "flash_attn_3",
+        "fa2",
+        "fa3",
+        "fa3_fp8",
         "xformers",
         "sdpa",
-        "sage_attn",
-        "sparge_attn",
+        "sage",
+        "sparge",
     ]
     flash_attn3_compatible = q.shape[-1] <= FA3_MAX_HEADDIM
     if attn_impl is None or attn_impl == "auto":
@@ -139,9 +141,13 @@ def attention(
                 return flash_attn3(q, k, v, softmax_scale=scale)
             else:
                 if not flash_attn3_compatible:
-                    logger.warning(f"head_dim={q.shape[-1]}, but flash_attn_3 only supports head dimension at most {FA3_MAX_HEADDIM}, will use fallback attention implementation")
+                    logger.warning(
+                        f"head_dim={q.shape[-1]}, but flash_attn_3 only supports head dimension at most {FA3_MAX_HEADDIM}, will use fallback attention implementation"
+                    )
                 else:
-                    logger.debug("flash_attn_3 does not support attention mask, will use fallback attention implementation")
+                    logger.debug(
+                        "flash_attn_3 does not support attention mask, will use fallback attention implementation"
+                    )
         if XFORMERS_AVAILABLE:
             return xformers_attn(q, k, v, attn_mask=attn_mask, scale=scale)
         if SDPA_AVAILABLE:
@@ -152,23 +158,31 @@ def attention(
     else:
         if attn_impl == "eager":
             return eager_attn(q, k, v, attn_mask=attn_mask, scale=scale)
-        if attn_impl == "flash_attn_3":
+        if attn_impl == "fa3" or attn_impl == "fa3_fp8":
             if not flash_attn3_compatible:
                 raise RuntimeError(
                     f"head_dim={q.shape[-1]}, but flash_attn_3 only supports head dimension at most {FA3_MAX_HEADDIM}"
                 )
             if attn_mask is not None:
                 raise RuntimeError("flash_attn_3 does not support attention mask")
-            return flash_attn3(q, k, v, softmax_scale=scale)
-        if attn_impl == "flash_attn_2":
+            if attn_impl == "fa3":
+                return flash_attn3(q, k, v, softmax_scale=scale)
+            else:
+                origin_dtype = q.dtype
+                q = q.to(dtype=DTYPE_FP8)
+                k = k.to(dtype=DTYPE_FP8)
+                v = v.to(dtype=DTYPE_FP8)
+                out = flash_attn3(q, k, v, softmax_scale=scale)
+                return out.to(dtype=origin_dtype)
+        if attn_impl == "fa2":
             return flash_attn2(q, k, v, softmax_scale=scale)
         if attn_impl == "xformers":
             return xformers_attn(q, k, v, attn_mask=attn_mask, scale=scale)
         if attn_impl == "sdpa":
             return sdpa_attn(q, k, v, attn_mask=attn_mask, scale=scale)
-        if attn_impl == "sage_attn":
+        if attn_impl == "sage":
             return sage_attn(q, k, v, attn_mask=attn_mask, scale=scale)
-        if attn_impl == "sparge_attn":
+        if attn_impl == "sparge":
             return sparge_attn(
                 q,
                 k,
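
For context, a minimal caller-side sketch of the renamed attn_impl values introduced above. The import path for attention() and the tensor layout/dtype below are assumptions for illustration, not confirmed by this diff; "fa3_fp8" additionally requires flash_attn_3 and an fp8-capable GPU.

import torch
from diffsynth_engine.models.basic.attention import attention  # assumed module path

# placeholder shapes: (batch, seq_len, num_heads, head_dim)
q = torch.randn(1, 1024, 16, 128, dtype=torch.bfloat16, device="cuda")
k = torch.randn(1, 1024, 16, 128, dtype=torch.bfloat16, device="cuda")
v = torch.randn(1, 1024, 16, 128, dtype=torch.bfloat16, device="cuda")

out = attention(q, k, v, attn_impl="fa3")      # previously "flash_attn_3"
out = attention(q, k, v, attn_impl="fa3_fp8")  # new: casts q/k/v to fp8, returns output in the original dtype
out = attention(q, k, v, attn_impl="sage")     # previously "sage_attn"
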
@@ -247,12 +261,14 @@ def long_context_attention(
     assert attn_impl in [
         None,
         "auto",
-        "flash_attn_2",
-        "flash_attn_3",
+        "fa2",
+        "fa3",
+        "fa3_fp8",
         "sdpa",
-        "sage_attn",
-        "sparge_attn",
+        "sage",
+        "sparge",
     ]
+    assert attn_mask is None, "long context attention does not support attention mask"
     flash_attn3_compatible = q.shape[-1] <= FA3_MAX_HEADDIM
     if attn_impl is None or attn_impl == "auto":
         if FLASH_ATTN_3_AVAILABLE:
@@ -268,20 +284,27 @@ def long_context_attention(
             return LongContextAttention(attn_type=AttnType.FA)(q, k, v, softmax_scale=scale)
         raise ValueError("No available long context attention implementation")
     else:
-        if attn_impl == "flash_attn_3":
-            if flash_attn3_compatible:
-                return LongContextAttention(attn_type=AttnType.FA3)(q, k, v, softmax_scale=scale)
-            else:
+        if attn_impl == "fa3" or attn_impl == "fa3_fp8":
+            if not flash_attn3_compatible:
                 raise RuntimeError(
                     f"head_dim={q.shape[-1]}, but flash_attn_3 only supports head dimension at most {FA3_MAX_HEADDIM}"
                 )
-        if attn_impl == "flash_attn_2":
+            if attn_impl == "fa3":
+                return LongContextAttention(attn_type=AttnType.FA3)(q, k, v, softmax_scale=scale)
+
+            origin_dtype = q.dtype
+            q = q.to(dtype=DTYPE_FP8)
+            k = k.to(dtype=DTYPE_FP8)
+            v = v.to(dtype=DTYPE_FP8)
+            out = LongContextAttention(attn_type=AttnType.FA3)(q, k, v, softmax_scale=scale)
+            return out.to(dtype=origin_dtype)
+        if attn_impl == "fa2":
             return LongContextAttention(attn_type=AttnType.FA)(q, k, v, softmax_scale=scale)
         if attn_impl == "sdpa":
             return LongContextAttention(attn_type=AttnType.TORCH)(q, k, v, softmax_scale=scale)
-        if attn_impl == "sage_attn":
-            return LongContextAttention(attn_type=AttnType.SAGE_FP8)(q, k, v, softmax_scale=scale)
-        if attn_impl == "sparge_attn":
+        if attn_impl == "sage":
+            return LongContextAttention(attn_type=AttnType.SAGE_AUTO)(q, k, v, softmax_scale=scale)
+        if attn_impl == "sparge":
             attn_processor = SparseAttentionMeansim()
             # default args from spas_sage2_attn_meansim_cuda
             attn_processor.smooth_k = torch.tensor(kwargs.get("sparge_smooth_k", True))
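
Both functions now share the same fp8 round-trip for the "fa3_fp8" path. Below is a minimal standalone sketch of that technique; DTYPE_FP8 is assumed to resolve to torch.float8_e4m3fn, and attn_fn is a stand-in for flash_attn3 or a LongContextAttention instance (both names here are illustrative, not the repository's API).

import torch

DTYPE_FP8 = torch.float8_e4m3fn  # assumed value of diffsynth_engine.utils.platform.DTYPE_FP8

def fp8_roundtrip(q, k, v, attn_fn, softmax_scale=None):
    # Cast inputs down to fp8, run the fp8-capable kernel, then restore the caller's dtype.
    origin_dtype = q.dtype
    q, k, v = (t.to(dtype=DTYPE_FP8) for t in (q, k, v))
    out = attn_fn(q, k, v, softmax_scale=softmax_scale)
    return out.to(dtype=origin_dtype)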