
Commit 1589e75

refactor vae encoder block
1 parent 846b5f9 commit 1589e75

4 files changed, +36 −46 lines changed

src/diffusers/modular_pipelines/wan/before_denoise.py

Lines changed: 5 additions & 5 deletions

@@ -269,7 +269,7 @@ class WanInputsDynamicStep(ModularPipelineBlocks):
 
     def __init__(
         self,
-        image_latent_inputs: List[str] = ["first_frame_latents"],
+        image_latent_inputs: List[str] = ["condition_latents"],
         additional_batch_inputs: List[str] = ["image_embeds"],
     ):
         """Initialize a configurable step that standardizes the inputs for the denoising step. It:\n"
@@ -559,15 +559,15 @@ def description(self) -> str:
     @property
     def inputs(self) -> List[InputParam]:
         return [
-            InputParam("first_frame_latents", type_hint=Optional[torch.Tensor]),
+            InputParam("condition_latents", type_hint=Optional[torch.Tensor]),
             InputParam("num_frames", type_hint=int),
         ]
 
 
     def __call__(self, components: WanModularPipeline, state: PipelineState) -> PipelineState:
         block_state = self.get_block_state(state)
 
-        batch_size, _, _, latent_height, latent_width = block_state.first_frame_latents.shape
+        batch_size, _, _, latent_height, latent_width = block_state.condition_latents.shape
 
         mask_lat_size = torch.ones(batch_size, 1, block_state.num_frames, latent_height, latent_width)
         mask_lat_size[:, :, list(range(1, block_state.num_frames))] = 0
@@ -577,8 +577,8 @@ def __call__(self, components: WanModularPipeline, state: PipelineState) -> PipelineState:
         mask_lat_size = torch.concat([first_frame_mask, mask_lat_size[:, :, 1:, :]], dim=2)
         mask_lat_size = mask_lat_size.view(batch_size, -1, components.vae_scale_factor_temporal, latent_height, latent_width)
         mask_lat_size = mask_lat_size.transpose(1, 2)
-        mask_lat_size = mask_lat_size.to(block_state.first_frame_latents.device)
-        block_state.first_frame_latents = torch.concat([mask_lat_size, block_state.first_frame_latents], dim=1)
+        mask_lat_size = mask_lat_size.to(block_state.condition_latents.device)
+        block_state.condition_latents = torch.concat([mask_lat_size, block_state.condition_latents], dim=1)
 
         self.set_block_state(state, block_state)
         return components, state
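Note: aside from the rename to condition_latents, the mask handling in this block is unchanged: a single-channel frame mask keeps only the first frame, the first-frame mask is repeated so the temporal axis folds evenly into the VAE's temporal compression, and the result is concatenated with the condition latents along the channel dimension. Below is a minimal standalone sketch of that shape logic in plain torch, using illustrative sizes (batch size, frame count, channel count, and vae_scale_factor_temporal are example values, not pipeline defaults):

import torch

# Illustrative sizes only.
batch_size, num_frames = 1, 81
latent_height, latent_width = 60, 104
vae_scale_factor_temporal = 4
num_latent_frames = (num_frames - 1) // vae_scale_factor_temporal + 1
condition_latents = torch.randn(batch_size, 16, num_latent_frames, latent_height, latent_width)

# Mask over pixel-space frames: 1 for the conditioning (first) frame, 0 elsewhere.
mask_lat_size = torch.ones(batch_size, 1, num_frames, latent_height, latent_width)
mask_lat_size[:, :, 1:] = 0

# Repeat the first-frame mask so the temporal length folds evenly into the
# temporal compression factor, then move the folded frames into a channel-like axis.
first_frame_mask = mask_lat_size[:, :, :1].repeat(1, 1, vae_scale_factor_temporal, 1, 1)
mask_lat_size = torch.concat([first_frame_mask, mask_lat_size[:, :, 1:]], dim=2)
mask_lat_size = mask_lat_size.view(batch_size, -1, vae_scale_factor_temporal, latent_height, latent_width)
mask_lat_size = mask_lat_size.transpose(1, 2)

# Channel-wise concat with the condition latents, as in the block above.
condition_latents = torch.concat([mask_lat_size, condition_latents], dim=1)
print(condition_latents.shape)  # torch.Size([1, 20, 21, 60, 104]) with these example sizes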

src/diffusers/modular_pipelines/wan/denoise.py

Lines changed: 3 additions & 3 deletions

@@ -84,16 +84,16 @@ def inputs(self) -> List[InputParam]:
                 description="The initial latents to use for the denoising process. Can be generated in prepare_latent step.",
             ),
             InputParam(
-                "first_frame_latents",
+                "condition_latents",
                 required=True,
                 type_hint=torch.Tensor,
-                description="The first frame latents to use for the denoising process. Can be generated in prepare_first_frame_latents step.",
+                description="The condition latents to use for the denoising process. Can be generated in prepare_condition_latents step.",
             ),
         ]
 
     @torch.no_grad()
     def __call__(self, components: WanModularPipeline, block_state: BlockState, i: int, t: torch.Tensor):
-        block_state.latent_model_input = torch.cat([block_state.latents, block_state.first_frame_latents], dim=1)
+        block_state.latent_model_input = torch.cat([block_state.latents, block_state.condition_latents], dim=1)
         return components, block_state
 
 class WanLoopDenoiserDynamic(ModularPipelineBlocks):
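Note: the only functional line here is the channel-wise concatenation of the noisy latents with the renamed condition_latents before each denoiser call. A small hedged illustration of the resulting shape, with example channel counts only (the channel split the transformer actually expects depends on the checkpoint):

import torch

# Illustrative shapes: 16 noise channels plus 20 condition channels (mask + latents).
latents = torch.randn(1, 16, 21, 60, 104)
condition_latents = torch.randn(1, 20, 21, 60, 104)

latent_model_input = torch.cat([latents, condition_latents], dim=1)
print(latent_model_input.shape)  # torch.Size([1, 36, 21, 60, 104])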

src/diffusers/modular_pipelines/wan/encoders.py

Lines changed: 25 additions & 35 deletions

@@ -117,55 +117,42 @@ def retrieve_latents(
 
 
 def encode_vae_image(
-    image: torch.Tensor,
+    video_tensor: torch.Tensor,
     vae: AutoencoderKLWan,
     generator: torch.Generator,
     device: torch.device,
     dtype: torch.dtype,
-    num_frames: int = 81,
-    height: int = 480,
-    width: int = 832,
     latent_channels: int = 16,
 ):
-    if not isinstance(image, torch.Tensor):
-        raise ValueError(f"Expected image to be a tensor, got {type(image)}.")
+    if not isinstance(video_tensor, torch.Tensor):
+        raise ValueError(f"Expected video_tensor to be a tensor, got {type(video_tensor)}.")
 
-    if isinstance(generator, list) and len(generator) != image.shape[0]:
-        raise ValueError(f"You have passed a list of generators of length {len(generator)}, but it is not same as number of images {image.shape[0]}.")
+    if isinstance(generator, list) and len(generator) != video_tensor.shape[0]:
+        raise ValueError(f"You have passed a list of generators of length {len(generator)}, but it is not same as number of images {video_tensor.shape[0]}.")
 
-    # preprocessed image should be a 4D tensor: batch_size, num_channels, height, width
-    if image.dim() == 4:
-        image = image.unsqueeze(2)
-    elif image.dim() != 5:
-        raise ValueError(f"Expected image dims 4 or 5, got {image.dim()}.")
-
-    video_condition = torch.cat(
-        [image, image.new_zeros(image.shape[0], image.shape[1], num_frames - 1, height, width)], dim=2
-    )
-
-    video_condition = video_condition.to(device=device, dtype=dtype)
+    video_tensor = video_tensor.to(device=device, dtype=dtype)
 
     if isinstance(generator, list):
-        latent_condition = [
-            retrieve_latents(vae.encode(video_condition[i : i + 1]), generator=generator[i], sample_mode="argmax") for i in range(image.shape[0])
+        video_latents = [
+            retrieve_latents(vae.encode(video_tensor[i : i + 1]), generator=generator[i], sample_mode="argmax") for i in range(video_tensor.shape[0])
         ]
-        latent_condition = torch.cat(latent_condition, dim=0)
+        video_latents = torch.cat(video_latents, dim=0)
     else:
-        latent_condition = retrieve_latents(vae.encode(video_condition), sample_mode="argmax")
+        video_latents = retrieve_latents(vae.encode(video_tensor), sample_mode="argmax")
 
     latents_mean = (
         torch.tensor(vae.config.latents_mean)
         .view(1, latent_channels, 1, 1, 1)
-        .to(latent_condition.device, latent_condition.dtype)
+        .to(video_latents.device, video_latents.dtype)
     )
     latents_std = (
         1.0 / torch.tensor(vae.config.latents_std)
        .view(1, latent_channels, 1, 1, 1)
-        .to(latent_condition.device, latent_condition.dtype)
+        .to(video_latents.device, video_latents.dtype)
     )
-    latent_condition = (latent_condition - latents_mean) * latents_std
+    video_latents = (video_latents - latents_mean) * latents_std
 
-    return latent_condition
+    return video_latents
 
 
 
@@ -441,7 +428,7 @@ class WanVaeImageEncoderStep(ModularPipelineBlocks):
 
     @property
     def description(self) -> str:
-        return "Vae Image Encoder step that generate first_frame_latents to guide the video generation"
+        return "Vae Image Encoder step that generate condition_latents to guide the video generation"
 
     @property
     def expected_components(self) -> List[ComponentSpec]:
@@ -463,7 +450,7 @@ def inputs(self) -> List[InputParam]:
     @property
     def intermediate_outputs(self) -> List[OutputParam]:
         return [
-            OutputParam("first_frame_latents", type_hint=torch.Tensor, description="The latent condition"),
+            OutputParam("condition_latents", type_hint=torch.Tensor, description="The condition latents"),
         ]
 
     @staticmethod
@@ -497,18 +484,21 @@ def __call__(self, components: WanModularPipeline, state: PipelineState) -> PipelineState:
         image_tensor = components.video_processor.preprocess(
             image, height=height, width=width).to(device=device, dtype=dtype)
 
-        latent_condition = encode_vae_image(
-            image=image_tensor,
+        if image_tensor.dim() == 4:
+            image_tensor = image_tensor.unsqueeze(2)
+
+        video_tensor = torch.cat(
+            [image_tensor, image_tensor.new_zeros(image_tensor.shape[0], image_tensor.shape[1], num_frames - 1, height, width)], dim=2
+        ).to(device=device, dtype=dtype)
+
+        block_state.condition_latents = encode_vae_image(
+            video_tensor=video_tensor,
             vae=components.vae,
             generator=block_state.generator,
            device=device,
            dtype=dtype,
-            num_frames=num_frames,
-            height=height,
-            width=width,
             latent_channels=components.num_channels_latents,
         )
 
-        block_state.first_frame_latents = latent_condition
         self.set_block_state(state, block_state)
         return components, state
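Note: this is the substance of the "refactor vae encoder block" commit. encode_vae_image no longer builds the zero-padded conditioning video itself (num_frames, height, and width are dropped from its signature); it only moves, normalizes, and encodes whatever 5D video tensor it is given. The first-frame-plus-zero-padding construction now lives in WanVaeImageEncoderStep before the call. A minimal sketch of that caller-side preparation, in plain torch with illustrative sizes (the VAE encode itself is omitted, and the trailing call is only indicative):

import torch

# Illustrative sizes; the real values come from the pipeline inputs.
num_frames, height, width = 81, 480, 832
image_tensor = torch.randn(1, 3, height, width)  # preprocessed first frame

# Add a temporal axis: (B, C, H, W) -> (B, C, 1, H, W).
if image_tensor.dim() == 4:
    image_tensor = image_tensor.unsqueeze(2)

# Pad with zero frames so the conditioning clip spans the full video length,
# mirroring what WanVaeImageEncoderStep now does before calling encode_vae_image.
video_tensor = torch.cat(
    [image_tensor, image_tensor.new_zeros(image_tensor.shape[0], image_tensor.shape[1], num_frames - 1, height, width)],
    dim=2,
)
print(video_tensor.shape)  # torch.Size([1, 3, 81, 480, 832])

# The refactored helper would then be called roughly as:
# condition_latents = encode_vae_image(video_tensor=video_tensor, vae=vae, generator=generator,
#                                      device=device, dtype=dtype, latent_channels=16)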

src/diffusers/modular_pipelines/wan/modular_blocks.py

Lines changed: 3 additions & 3 deletions

@@ -81,7 +81,7 @@ def description(self):
 class WanImage2VideoCoreDenoiseStep(SequentialPipelineBlocks):
     block_classes = [
         WanTextInputStep,
-        WanInputsDynamicStep(image_latent_inputs=["first_frame_latents"]),
+        WanInputsDynamicStep(image_latent_inputs=["condition_latents"]),
         WanSetTimestepsStep,
         WanPrepareLatentsStep,
         WanPrepareFirstFrameLatentsStep,
@@ -137,7 +137,7 @@ class WanAutoDenoiseStep(AutoPipelineBlocks):
         WanCoreDenoiseStep,
     ]
     block_names = ["image2video", "text2video"]
-    block_trigger_inputs = ["first_frame_latents", None]
+    block_trigger_inputs = ["condition_latents", None]
 
     @property
     def description(self) -> str:
@@ -193,7 +193,7 @@ def description(self):
         ("image_encoder", WanImage2VideoImageEncoderStep),
         ("vae_image_encoder", WanImage2VideoVaeImageEncoderStep),
         ("input", WanTextInputStep),
-        ("additional_inputs", WanInputsDynamicStep(image_latent_inputs=["first_frame_latents"])),
+        ("additional_inputs", WanInputsDynamicStep(image_latent_inputs=["condition_latents"])),
         ("set_timesteps", WanSetTimestepsStep),
         ("prepare_latents", WanPrepareLatentsStep),
         ("denoise", WanImage2VideoCoreDenoiseStep),
