21 changes: 13 additions & 8 deletions invokeai/app/invocations/flux_denoise.py
@@ -20,7 +20,6 @@
 from invokeai.app.invocations.primitives import LatentsOutput
 from invokeai.app.services.shared.invocation_context import InvocationContext
 from invokeai.backend.flux.denoise import denoise
-from invokeai.backend.flux.inpaint_extension import InpaintExtension
 from invokeai.backend.flux.model import Flux
 from invokeai.backend.flux.sampling_utils import (
     clip_timestep_schedule,
@@ -30,6 +29,7 @@
     pack,
     unpack,
 )
+from invokeai.backend.flux.trajectory_guidance_extension import TrajectoryGuidanceExtension
 from invokeai.backend.lora.lora_model_raw import LoRAModelRaw
 from invokeai.backend.lora.lora_patcher import LoRAPatcher
 from invokeai.backend.model_manager.config import ModelFormat
@@ -68,6 +68,12 @@ class FluxDenoiseInvocation(BaseInvocation, WithMetadata, WithBoard):
         description=FieldDescriptions.denoising_start,
     )
     denoising_end: float = InputField(default=1.0, ge=0, le=1, description=FieldDescriptions.denoising_end)
+    trajectory_guidance_strength: float = InputField(
+        default=0.0,
+        ge=0.0,
+        le=1.0,
+        description="How strongly to guide the denoising process towards the initial latents (during image-to-image). Range [0, 1]. A value of 0.0 is equivalent to vanilla image-to-image. A value of 1.0 keeps the denoising process very close to the original latents.",
+    )
     transformer: TransformerField = InputField(
         description=FieldDescriptions.flux_model,
         input=Input.Connection,
@@ -181,14 +187,13 @@ def _run_diffusion(
         # Now that we have 'packed' the latent tensors, verify that we calculated the image_seq_len correctly.
         assert image_seq_len == x.shape[1]
 
-        # Prepare inpaint extension.
-        inpaint_extension: InpaintExtension | None = None
-        if inpaint_mask is not None:
-            assert init_latents is not None
-            inpaint_extension = InpaintExtension(
+        # Prepare trajectory guidance extension.
+        traj_guidance_extension: TrajectoryGuidanceExtension | None = None
+        if init_latents is not None:
+            traj_guidance_extension = TrajectoryGuidanceExtension(
                 init_latents=init_latents,
                 inpaint_mask=inpaint_mask,
-                noise=noise,
+                trajectory_guidance_strength=self.trajectory_guidance_strength,
             )
 
         with (
@@ -236,7 +241,7 @@ def _run_diffusion(
                 timesteps=timesteps,
                 step_callback=self._build_step_callback(context),
                 guidance=self.guidance,
-                inpaint_extension=inpaint_extension,
+                traj_guidance_extension=traj_guidance_extension,
             )
 
         x = unpack(x.float(), self.height, self.width)
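Note: the extension is now constructed whenever `init_latents` is present (not only when an inpaint mask is set), so plain image-to-image also flows through it. A minimal construction sketch, with hypothetical packed-latent shapes:

```python
import torch

from invokeai.backend.flux.trajectory_guidance_extension import TrajectoryGuidanceExtension

# Hypothetical packed latents for a 1024x1024 image: (batch, seq_len, channels).
init_latents = torch.randn(1, 4096, 64)

ext = TrajectoryGuidanceExtension(
    init_latents=init_latents,
    inpaint_mask=None,  # None is treated as an all-ones mask (regenerate everything)
    trajectory_guidance_strength=0.5,  # 0.0 would be vanilla image-to-image
)
```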
16 changes: 9 additions & 7 deletions invokeai/backend/flux/denoise.py
@@ -3,8 +3,8 @@
 import torch
 from tqdm import tqdm
 
-from invokeai.backend.flux.inpaint_extension import InpaintExtension
 from invokeai.backend.flux.model import Flux
+from invokeai.backend.flux.trajectory_guidance_extension import TrajectoryGuidanceExtension
 from invokeai.backend.stable_diffusion.diffusers_pipeline import PipelineIntermediateState
 
 
@@ -20,7 +20,7 @@ def denoise(
     timesteps: list[float],
     step_callback: Callable[[PipelineIntermediateState], None],
     guidance: float,
-    inpaint_extension: InpaintExtension | None,
+    traj_guidance_extension: TrajectoryGuidanceExtension | None,  # noqa: F821
 ):
     step = 0
     # guidance_vec is ignored for schnell.
@@ -36,12 +36,14 @@ def denoise(
             timesteps=t_vec,
             guidance=guidance_vec,
         )
-        preview_img = img - t_curr * pred
-        img = img + (t_prev - t_curr) * pred
 
-        if inpaint_extension is not None:
-            img = inpaint_extension.merge_intermediate_latents_with_init_latents(img, t_prev)
-            preview_img = inpaint_extension.merge_intermediate_latents_with_init_latents(preview_img, 0.0)
+        if traj_guidance_extension is not None:
+            img = traj_guidance_extension.step(t_curr_latents=img, pred_noise=pred, t_curr=t_curr, t_prev=t_prev)
+            # TODO(ryand): Generate a better preview image.
+            preview_img = img
+        else:
+            preview_img = img - t_curr * pred
+            img = img + (t_prev - t_curr) * pred
 
         step_callback(
             PipelineIntermediateState(
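For context, the loop's update is a plain rectified-flow Euler step, and the preview line extrapolates straight to t=0. A standalone sketch of the two formulas the `else` branch preserves (argument tensors assumed):

```python
import torch


def euler_step(img: torch.Tensor, pred: torch.Tensor, t_curr: float, t_prev: float) -> torch.Tensor:
    # Rectified-flow update: x(t_prev) = x(t_curr) + (t_prev - t_curr) * v_pred.
    return img + (t_prev - t_curr) * pred


def preview_at_t0(img: torch.Tensor, pred: torch.Tensor, t_curr: float) -> torch.Tensor:
    # One-jump extrapolation to t=0, i.e. euler_step with t_prev = 0.
    return img - t_curr * pred
```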
125 changes: 125 additions & 0 deletions invokeai/backend/flux/trajectory_guidance_extension.py
@@ -0,0 +1,125 @@
import torch

from invokeai.backend.util.build_line import build_line


class TrajectoryGuidanceExtension:
    """An implementation of trajectory guidance for FLUX.

    What is trajectory guidance?
    ----------------------------
    With SD1 and SDXL, the amount of change in image-to-image denoising is largely controlled by the denoising_start
    parameter. Doing the same thing with the FLUX model does not work as well, because the FLUX model converges very
    quickly (roughly from time 1.0 to 0.9) to the structure of the final image. As a result of this model
    characteristic, you typically get one of two outcomes:
    1) a result that is very similar to the original image, or
    2) a result that is very different from the original image, as though it was generated from the text prompt with
       pure noise.

    To address this issue in image-to-image workflows with FLUX, we employ the concept of trajectory guidance. The
    idea is that in addition to controlling the denoising_start parameter (i.e. the amount of noise added to the
    original image), we can also guide the denoising process to stay close to the trajectory that would reproduce the
    original image. By controlling the strength of the trajectory guidance throughout the denoising process, we can
    achieve FLUX image-to-image behavior with the same level of control offered by SD1 and SDXL.

    What is the trajectory_guidance_strength?
    -----------------------------------------
    In the limit, we could apply a different trajectory guidance 'strength' for every latent value at every timestep.
    This would be impractical for a user, so instead we have engineered a strength schedule that is more convenient
    to use. The `trajectory_guidance_strength` parameter is a single scalar value that maps to a schedule. The
    engineered schedule is defined as:
    1) An initial change_ratio at t=1.0.
    2) A linear ramp up to change_ratio=1.0 at t=t_cutoff.
    3) A constant change_ratio=1.0 after t=t_cutoff.
    """

    def __init__(
        self, init_latents: torch.Tensor, inpaint_mask: torch.Tensor | None, trajectory_guidance_strength: float
    ):
        """Initialize TrajectoryGuidanceExtension.

        Args:
            init_latents (torch.Tensor): The initial latents (i.e. un-noised at timestep 0). In 'packed' format.
            inpaint_mask (torch.Tensor | None): A mask specifying which elements to inpaint. Range [0, 1]. Values of
                1 will be re-generated. Values of 0 will remain unchanged. Values between 0 and 1 can be used to
                blend the inpainted region with the background. In 'packed' format. If None, will be treated as a
                mask of all 1s.
            trajectory_guidance_strength (float): A value in [0, 1] specifying the strength of the trajectory
                guidance. A value of 0.0 is equivalent to vanilla image-to-image. A value of 1.0 keeps the denoising
                process very close to the original latents.
        """
        assert 0.0 <= trajectory_guidance_strength <= 1.0
        self._init_latents = init_latents
        self._trajectory_guidance_strength = trajectory_guidance_strength
        if inpaint_mask is None:
            # The inpaint mask is None, so we initialize a mask with a single value of 1.0.
            # This value will be broadcast and treated as a mask of all 1s.
            self._inpaint_mask = torch.ones(1, device=init_latents.device, dtype=init_latents.dtype)
        else:
            self._inpaint_mask = inpaint_mask

    def _apply_mask_gradient_adjustment(self, t_prev: float) -> torch.Tensor:
        """Apply the inpaint mask gradient adjustment and return the mask to be used at the current timestep."""
        # As we progress through the denoising process, we promote gradient regions of the mask to a full weight of
        # 1.0. This helps to produce more coherent seams around the inpainted region. We experimented with a (small)
        # number of promotion strategies (e.g. gradual promotion based on timestep), but found that a simple cutoff
        # threshold worked well.
        # We use a small epsilon to avoid any potential issues with floating point precision.
        eps = 1e-4
        mask_gradient_t_cutoff = 0.5
        if t_prev > mask_gradient_t_cutoff:
            # Early in the denoising process, use the inpaint mask as-is.
            return self._inpaint_mask
        else:
            # After the cutoff, promote all non-zero mask values to 1.0.
            mask = self._inpaint_mask.where(self._inpaint_mask <= (0.0 + eps), 1.0)
            return mask

    def step(
        self, t_curr_latents: torch.Tensor, pred_noise: torch.Tensor, t_curr: float, t_prev: float
    ) -> torch.Tensor:
        # Handle the mask gradient cutoff.
        mask = self._apply_mask_gradient_adjustment(t_prev)

        # Calculate the change_ratio based on the trajectory_guidance_strength.
        # These mappings from trajectory_guidance_strength have no theoretical basis - they were tuned manually.
        change_ratio_at_t_1 = build_line(x1=0.0, y1=1.0, x2=1.0, y2=0.0)(self._trajectory_guidance_strength)
        change_ratio_at_cutoff = 1.0
        t_cutoff = build_line(x1=0.0, y1=1.0, x2=1.0, y2=0.5)(self._trajectory_guidance_strength)
        change_ratio = 1.0
        if t_prev > t_cutoff:
            # Before the cutoff, linearly interpolate between the change_ratio at t=1.0 and the change_ratio at the
            # cutoff.
            change_ratio = build_line(x1=1.0, y1=change_ratio_at_t_1, x2=t_cutoff, y2=change_ratio_at_cutoff)(t_prev)

        mask = mask * change_ratio

        # NOTE(ryand): During inpainting, it is common to guide the denoising process by noising the initial latents
        # for the current timestep and then blending the predicted intermediate latents with the noised initial
        # latents. For example:
        # ```
        # noised_init_latents = self._noise * t_prev + (1.0 - t_prev) * self._init_latents
        # return t_prev_latents * self._inpaint_mask + noised_init_latents * (1.0 - self._inpaint_mask)
        # ```
        # Instead of guiding based on the noised initial latents, we have decided to guide based on the noise
        # prediction that points towards the initial latents. The difference between these guidance strategies is
        # minor, but qualitatively we found the latter to produce slightly better results. When change_ratio is 0.0
        # or 1.0 there is no difference between the two strategies.
        #
        # We experimented with a number of related guidance strategies, but not exhaustively. It's entirely possible
        # that there's a much better way to do this.

        # Calculate the noise guidance: what noise should the model have predicted at this timestep to step towards
        # self._init_latents?
        # Derivation:
        # > t_prev_latents = t_curr_latents + (t_prev - t_curr) * pred_noise
        # > t_0_latents = t_curr_latents + (0 - t_curr) * init_traj_noise
        # > t_0_latents = t_curr_latents - t_curr * init_traj_noise
        # > init_traj_noise = (t_curr_latents - t_0_latents) / t_curr
        init_traj_noise = (t_curr_latents - self._init_latents) / t_curr

        # Blend init_traj_noise with pred_noise according to the inpaint mask and the trajectory guidance.
        noise = pred_noise * mask + init_traj_noise * (1.0 - mask)

        # Take a denoising step.
        return t_curr_latents + (t_prev - t_curr) * noise
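A quick sanity-check sketch (assuming the class above is importable): with `trajectory_guidance_strength=0.0` and no inpaint mask, the engineered schedule gives `change_ratio == 1.0` at every timestep, so the blended noise equals `pred_noise` and `step()` reduces to the vanilla Euler update:

```python
import torch

from invokeai.backend.flux.trajectory_guidance_extension import TrajectoryGuidanceExtension

ext = TrajectoryGuidanceExtension(
    init_latents=torch.zeros(1, 16, 64),  # arbitrary packed shape for the check
    inpaint_mask=None,
    trajectory_guidance_strength=0.0,
)
x = torch.randn(1, 16, 64)  # stand-in for the current latents
v = torch.randn(1, 16, 64)  # stand-in for the model's noise prediction
t_curr, t_prev = 1.0, 0.95

out = ext.step(t_curr_latents=x, pred_noise=v, t_curr=t_curr, t_prev=t_prev)
assert torch.allclose(out, x + (t_prev - t_curr) * v)  # vanilla image-to-image step
```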
6 changes: 6 additions & 0 deletions invokeai/backend/util/build_line.py
@@ -0,0 +1,6 @@
from typing import Callable


def build_line(x1: float, y1: float, x2: float, y2: float) -> Callable[[float], float]:
"""Build a linear function given two points on the line (x1, y1) and (x2, y2)."""
return lambda x: (y2 - y1) / (x2 - x1) * (x - x1) + y1
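A brief usage note for `build_line`: it returns the unique linear map through the two given points (callers must ensure x1 != x2). For example, the `t_cutoff` mapping in the extension interpolates from (strength=0, t=1.0) to (strength=1, t=0.5):

```python
f = build_line(x1=0.0, y1=1.0, x2=1.0, y2=0.5)
assert f(0.0) == 1.0
assert f(1.0) == 0.5
assert f(0.5) == 0.75  # midpoint
```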
7 changes: 7 additions & 0 deletions invokeai/frontend/web/public/locales/en.json
@@ -1040,6 +1040,7 @@
     "strength": "Strength",
     "symmetry": "Symmetry",
     "tileSize": "Tile Size",
+    "optimizedDenoising": "Optimized Denoising",
     "type": "Type",
     "postProcessing": "Post-Processing (Shift + U)",
     "processImage": "Process Image",
@@ -1539,6 +1540,12 @@
       "paragraphs": [
         "Structure controls how closely the output image will keep to the layout of the original. Low structure allows major changes, while high structure strictly maintains the original composition and layout."
       ]
     },
+    "optimizedDenoising": {
+      "heading": "Optimized Denoising",
+      "paragraphs": [
+        "Enable optimized denoising for enhanced image-to-image transformations with Flux models. This setting improves detail and clarity during generation, but may be turned off to preserve more of your original image."
+      ]
+    }
   },
   "unifiedCanvas": {
@@ -58,7 +58,8 @@ export type Feature =
   | 'upscaleModel'
   | 'scale'
   | 'creativity'
-  | 'structure';
+  | 'structure'
+  | 'optimizedDenoising';
 
 export type PopoverData = PopoverProps & {
   image?: string;
@@ -40,6 +40,7 @@ export type ParamsState = {
   cfgRescaleMultiplier: ParameterCFGRescaleMultiplier;
   guidance: ParameterGuidance;
   img2imgStrength: ParameterStrength;
+  optimizedDenoisingEnabled: boolean;
   iterations: number;
   scheduler: ParameterScheduler;
   seed: ParameterSeed;
@@ -83,6 +84,7 @@
   cfgRescaleMultiplier: 0,
   guidance: 4,
   img2imgStrength: 0.75,
+  optimizedDenoisingEnabled: true,
   iterations: 1,
   scheduler: 'euler',
   seed: 0,
@@ -141,6 +143,9 @@ export const paramsSlice = createSlice({
     setImg2imgStrength: (state, action: PayloadAction<number>) => {
       state.img2imgStrength = action.payload;
     },
+    setOptimizedDenoisingEnabled: (state, action: PayloadAction<boolean>) => {
+      state.optimizedDenoisingEnabled = action.payload;
+    },
     setSeamlessXAxis: (state, action: PayloadAction<boolean>) => {
       state.seamlessXAxis = action.payload;
     },
@@ -273,6 +278,7 @@
   setScheduler,
   setSeed,
   setImg2imgStrength,
+  setOptimizedDenoisingEnabled,
   setSeamlessXAxis,
   setSeamlessYAxis,
   setShouldRandomizeSeed,
@@ -341,6 +347,7 @@ export const selectInfillPatchmatchDownscaleSize = createParamsSelector(
 );
 export const selectInfillColorValue = createParamsSelector((params) => params.infillColorValue);
 export const selectImg2imgStrength = createParamsSelector((params) => params.img2imgStrength);
+export const selectOptimizedDenoisingEnabled = createParamsSelector((params) => params.optimizedDenoisingEnabled);
 export const selectPositivePrompt = createParamsSelector((params) => params.positivePrompt);
 export const selectNegativePrompt = createParamsSelector((params) => params.negativePrompt);
 export const selectPositivePrompt2 = createParamsSelector((params) => params.positivePrompt2);
@@ -37,7 +37,17 @@ export const buildFLUXGraph = async (
 
   const { originalSize, scaledSize } = getSizes(bbox);
 
-  const { model, guidance, seed, steps, fluxVAE, t5EncoderModel, clipEmbedModel, img2imgStrength } = params;
+  const {
+    model,
+    guidance,
+    seed,
+    steps,
+    fluxVAE,
+    t5EncoderModel,
+    clipEmbedModel,
+    img2imgStrength,
+    optimizedDenoisingEnabled,
+  } = params;
 
   assert(model, 'No model found in state');
   assert(t5EncoderModel, 'No T5 Encoder model found in state');
@@ -68,7 +78,8 @@
       guidance,
       num_steps: steps,
       seed,
-      denoising_start: 0, // denoising_start should be 0 when latents are not provided
+      trajectory_guidance_strength: 0,
+      denoising_start: 0,
       denoising_end: 1,
       width: scaledSize.width,
       height: scaledSize.height,
@@ -113,6 +124,8 @@
     clip_embed_model: clipEmbedModel,
   });
 
+  const denoisingStart = 1 - img2imgStrength;
+
   if (generationMode === 'txt2img') {
     canvasOutput = addTextToImage(g, l2i, originalSize, scaledSize);
   } else if (generationMode === 'img2img') {
@@ -125,9 +138,15 @@
       originalSize,
       scaledSize,
       bbox,
-      1 - img2imgStrength,
+      denoisingStart,
       false
     );
+    if (optimizedDenoisingEnabled) {
+      g.updateNode(noise, {
+        denoising_start: 0,
+        trajectory_guidance_strength: img2imgStrength,
+      });
+    }
   } else if (generationMode === 'inpaint') {
     canvasOutput = await addInpaint(
       state,
@@ -139,9 +158,15 @@
       modelLoader,
       originalSize,
       scaledSize,
-      1 - img2imgStrength,
+      denoisingStart,
      false
     );
+    if (optimizedDenoisingEnabled) {
+      g.updateNode(noise, {
+        denoising_start: 0,
+        trajectory_guidance_strength: img2imgStrength,
+      });
+    }
   } else if (generationMode === 'outpaint') {
     canvasOutput = await addOutpaint(
       state,
@@ -153,9 +178,15 @@
       modelLoader,
      originalSize,
       scaledSize,
-      1 - img2imgStrength,
+      denoisingStart,
       false
     );
+    if (optimizedDenoisingEnabled) {
+      g.updateNode(noise, {
+        denoising_start: 0,
+        trajectory_guidance_strength: img2imgStrength,
+      });
+    }
   }
 
   if (state.system.shouldUseNSFWChecker) {
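To summarize the graph-builder change: when Optimized Denoising is enabled, FLUX image-to-image, inpaint, and outpaint run the full schedule from pure noise and express strength through trajectory guidance instead of a late `denoising_start`. A restatement of that mapping as a Python sketch (the helper name is ours, not part of this PR):

```python
def flux_denoise_params(img2img_strength: float, optimized: bool) -> dict:
    """Hypothetical helper mirroring how buildFLUXGraph maps UI params to node fields."""
    if optimized:
        # Optimized path: full schedule from pure noise; strength maps to trajectory guidance.
        return {"denoising_start": 0.0, "trajectory_guidance_strength": img2img_strength}
    # Classic path: skip the first (1 - strength) of the schedule, no trajectory guidance.
    return {"denoising_start": 1.0 - img2img_strength, "trajectory_guidance_strength": 0.0}
```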