21 changes: 13 additions & 8 deletions invokeai/app/invocations/flux_denoise.py
@@ -20,7 +20,6 @@
 from invokeai.app.invocations.primitives import LatentsOutput
 from invokeai.app.services.shared.invocation_context import InvocationContext
 from invokeai.backend.flux.denoise import denoise
-from invokeai.backend.flux.inpaint_extension import InpaintExtension
 from invokeai.backend.flux.model import Flux
 from invokeai.backend.flux.sampling_utils import (
     clip_timestep_schedule,
@@ -30,6 +29,7 @@
     pack,
     unpack,
 )
+from invokeai.backend.flux.trajectory_guidance_extension import TrajectoryGuidanceExtension
 from invokeai.backend.lora.lora_model_raw import LoRAModelRaw
 from invokeai.backend.lora.lora_patcher import LoRAPatcher
 from invokeai.backend.model_manager.config import ModelFormat
@@ -68,6 +68,12 @@ class FluxDenoiseInvocation(BaseInvocation, WithMetadata, WithBoard):
         description=FieldDescriptions.denoising_start,
     )
     denoising_end: float = InputField(default=1.0, ge=0, le=1, description=FieldDescriptions.denoising_end)
+    trajectory_guidance_strength: float = InputField(
+        default=0.0,
+        ge=0.0,
+        le=1.0,
+        description="How strongly to guide the denoising process towards the initial latents (during image-to-image). Range [0, 1]. A value of 0.0 is equivalent to vanilla image-to-image. A value of 1.0 keeps the denoising process very close to the original latents.",
+    )
     transformer: TransformerField = InputField(
         description=FieldDescriptions.flux_model,
         input=Input.Connection,
@@ -181,14 +187,13 @@ def _run_diffusion(
         # Now that we have 'packed' the latent tensors, verify that we calculated the image_seq_len correctly.
         assert image_seq_len == x.shape[1]
 
-        # Prepare inpaint extension.
-        inpaint_extension: InpaintExtension | None = None
-        if inpaint_mask is not None:
-            assert init_latents is not None
-            inpaint_extension = InpaintExtension(
+        # Prepare trajectory guidance extension.
+        traj_guidance_extension: TrajectoryGuidanceExtension | None = None
+        if init_latents is not None:
+            traj_guidance_extension = TrajectoryGuidanceExtension(
                 init_latents=init_latents,
                 inpaint_mask=inpaint_mask,
-                noise=noise,
+                trajectory_guidance_strength=self.trajectory_guidance_strength,
             )
 
         with (
@@ -236,7 +241,7 @@ def _run_diffusion(
                 timesteps=timesteps,
                 step_callback=self._build_step_callback(context),
                 guidance=self.guidance,
-                inpaint_extension=inpaint_extension,
+                traj_guidance_extension=traj_guidance_extension,
             )
 
         x = unpack(x.float(), self.height, self.width)
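Note: the extension is now constructed whenever `init_latents` is present (not only when an inpaint mask is set), so plain image-to-image also flows through it. A minimal construction sketch, with hypothetical packed-latent shapes:

```python
import torch

from invokeai.backend.flux.trajectory_guidance_extension import TrajectoryGuidanceExtension

# Hypothetical packed latents for a 1024x1024 image: (batch, seq_len, channels).
init_latents = torch.randn(1, 4096, 64)

ext = TrajectoryGuidanceExtension(
    init_latents=init_latents,
    inpaint_mask=None,  # None is treated as an all-ones mask (regenerate everything)
    trajectory_guidance_strength=0.5,  # 0.0 would be vanilla image-to-image
)
```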
16 changes: 9 additions & 7 deletions invokeai/backend/flux/denoise.py
@@ -3,8 +3,8 @@
 import torch
 from tqdm import tqdm
 
-from invokeai.backend.flux.inpaint_extension import InpaintExtension
 from invokeai.backend.flux.model import Flux
+from invokeai.backend.flux.trajectory_guidance_extension import TrajectoryGuidanceExtension
 from invokeai.backend.stable_diffusion.diffusers_pipeline import PipelineIntermediateState
 
 
@@ -20,7 +20,7 @@ def denoise(
     timesteps: list[float],
     step_callback: Callable[[PipelineIntermediateState], None],
     guidance: float,
-    inpaint_extension: InpaintExtension | None,
+    traj_guidance_extension: TrajectoryGuidanceExtension | None,  # noqa: F821
 ):
     step = 0
     # guidance_vec is ignored for schnell.
@@ -36,12 +36,14 @@ def denoise(
             timesteps=t_vec,
             guidance=guidance_vec,
         )
-        preview_img = img - t_curr * pred
-        img = img + (t_prev - t_curr) * pred
 
-        if inpaint_extension is not None:
-            img = inpaint_extension.merge_intermediate_latents_with_init_latents(img, t_prev)
-            preview_img = inpaint_extension.merge_intermediate_latents_with_init_latents(preview_img, 0.0)
+        if traj_guidance_extension is not None:
+            img = traj_guidance_extension.step(t_curr_latents=img, pred_noise=pred, t_curr=t_curr, t_prev=t_prev)
+            # TODO(ryand): Generate a better preview image.
+            preview_img = img
+        else:
+            preview_img = img - t_curr * pred
+            img = img + (t_prev - t_curr) * pred
 
         step_callback(
             PipelineIntermediateState(
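For context, the loop's update is a plain rectified-flow Euler step, and the preview line extrapolates straight to t=0. A standalone sketch of the two formulas the `else` branch preserves (argument tensors assumed):

```python
import torch


def euler_step(img: torch.Tensor, pred: torch.Tensor, t_curr: float, t_prev: float) -> torch.Tensor:
    # Rectified-flow update: x(t_prev) = x(t_curr) + (t_prev - t_curr) * v_pred.
    return img + (t_prev - t_curr) * pred


def preview_at_t0(img: torch.Tensor, pred: torch.Tensor, t_curr: float) -> torch.Tensor:
    # One-jump extrapolation to t=0, i.e. euler_step with t_prev = 0.
    return img - t_curr * pred
```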
125 changes: 125 additions & 0 deletions invokeai/backend/flux/trajectory_guidance_extension.py
@@ -0,0 +1,125 @@
import torch

from invokeai.backend.util.build_line import build_line


class TrajectoryGuidanceExtension:
    """An implementation of trajectory guidance for FLUX.

    What is trajectory guidance?
    ----------------------------
    With SD1 and SDXL, the amount of change in image-to-image denoising is largely controlled by the denoising_start
    parameter. Doing the same thing with the FLUX model does not work as well, because the FLUX model converges very
    quickly (roughly from time 1.0 to 0.9) to the structure of the final image. As a result of this model
    characteristic, you typically get one of two outcomes:
    1) a result that is very similar to the original image, or
    2) a result that is very different from the original image, as though it was generated from the text prompt with
       pure noise.

    To address this issue in image-to-image workflows with FLUX, we employ the concept of trajectory guidance. The
    idea is that in addition to controlling the denoising_start parameter (i.e. the amount of noise added to the
    original image), we can also guide the denoising process to stay close to the trajectory that would reproduce the
    original image. By controlling the strength of the trajectory guidance throughout the denoising process, we can
    achieve FLUX image-to-image behavior with the same level of control offered by SD1 and SDXL.

    What is the trajectory_guidance_strength?
    -----------------------------------------
    In the limit, we could apply a different trajectory guidance 'strength' for every latent value at every timestep.
    This would be impractical for a user, so instead we have engineered a strength schedule that is more convenient
    to use. The `trajectory_guidance_strength` parameter is a single scalar value that maps to a schedule. The
    engineered schedule is defined as:
    1) An initial change_ratio at t=1.0.
    2) A linear ramp up to change_ratio=1.0 at t=t_cutoff.
    3) A constant change_ratio=1.0 after t=t_cutoff.
    """

    def __init__(
        self, init_latents: torch.Tensor, inpaint_mask: torch.Tensor | None, trajectory_guidance_strength: float
    ):
        """Initialize TrajectoryGuidanceExtension.

        Args:
            init_latents (torch.Tensor): The initial latents (i.e. un-noised at timestep 0). In 'packed' format.
            inpaint_mask (torch.Tensor | None): A mask specifying which elements to inpaint. Range [0, 1]. Values of
                1 will be re-generated. Values of 0 will remain unchanged. Values between 0 and 1 can be used to
                blend the inpainted region with the background. In 'packed' format. If None, will be treated as a
                mask of all 1s.
            trajectory_guidance_strength (float): A value in [0, 1] specifying the strength of the trajectory
                guidance. A value of 0.0 is equivalent to vanilla image-to-image. A value of 1.0 keeps the denoising
                process very close to the original latents.
        """
        assert 0.0 <= trajectory_guidance_strength <= 1.0
        self._init_latents = init_latents
        self._trajectory_guidance_strength = trajectory_guidance_strength
        if inpaint_mask is None:
            # The inpaint mask is None, so we initialize a mask with a single value of 1.0.
            # This value will be broadcast and treated as a mask of all 1s.
            self._inpaint_mask = torch.ones(1, device=init_latents.device, dtype=init_latents.dtype)
        else:
            self._inpaint_mask = inpaint_mask

    def _apply_mask_gradient_adjustment(self, t_prev: float) -> torch.Tensor:
        """Apply the inpaint mask gradient adjustment and return the mask to be used at the current timestep."""
        # As we progress through the denoising process, we promote gradient regions of the mask to a full weight of
        # 1.0. This helps to produce more coherent seams around the inpainted region. We experimented with a (small)
        # number of promotion strategies (e.g. gradual promotion based on timestep), but found that a simple cutoff
        # threshold worked well.
        # We use a small epsilon to avoid any potential issues with floating point precision.
        eps = 1e-4
        mask_gradient_t_cutoff = 0.5
        if t_prev > mask_gradient_t_cutoff:
            # Early in the denoising process, use the inpaint mask as-is.
            return self._inpaint_mask
        else:
            # After the cutoff, promote all non-zero mask values to 1.0.
            mask = self._inpaint_mask.where(self._inpaint_mask <= (0.0 + eps), 1.0)
            return mask

    def step(
        self, t_curr_latents: torch.Tensor, pred_noise: torch.Tensor, t_curr: float, t_prev: float
    ) -> torch.Tensor:
        # Handle the mask gradient cutoff.
        mask = self._apply_mask_gradient_adjustment(t_prev)

        # Calculate the change_ratio based on the trajectory_guidance_strength.
        # These mappings from trajectory_guidance_strength have no theoretical basis - they were tuned manually.
        change_ratio_at_t_1 = build_line(x1=0.0, y1=1.0, x2=1.0, y2=0.0)(self._trajectory_guidance_strength)
        change_ratio_at_cutoff = 1.0
        t_cutoff = build_line(x1=0.0, y1=1.0, x2=1.0, y2=0.5)(self._trajectory_guidance_strength)
        change_ratio = 1.0
        if t_prev > t_cutoff:
            # Before the cutoff, linearly interpolate between the change_ratio at t=1.0 and the change_ratio at the
            # cutoff.
            change_ratio = build_line(x1=1.0, y1=change_ratio_at_t_1, x2=t_cutoff, y2=change_ratio_at_cutoff)(t_prev)

        mask = mask * change_ratio

        # NOTE(ryand): During inpainting, it is common to guide the denoising process by noising the initial latents
        # for the current timestep and then blending the predicted intermediate latents with the noised initial
        # latents. For example:
        # ```
        # noised_init_latents = self._noise * t_prev + (1.0 - t_prev) * self._init_latents
        # return t_prev_latents * self._inpaint_mask + noised_init_latents * (1.0 - self._inpaint_mask)
        # ```
        # Instead of guiding based on the noised initial latents, we have decided to guide based on the noise
        # prediction that points towards the initial latents. The difference between these guidance strategies is
        # minor, but qualitatively we found the latter to produce slightly better results. When change_ratio is 0.0
        # or 1.0 there is no difference between the two strategies.
        #
        # We experimented with a number of related guidance strategies, but not exhaustively. It's entirely possible
        # that there's a much better way to do this.

        # Calculate the noise guidance: what noise should the model have predicted at this timestep to step towards
        # self._init_latents?
        # Derivation:
        # > t_prev_latents = t_curr_latents + (t_prev - t_curr) * pred_noise
        # > t_0_latents = t_curr_latents + (0 - t_curr) * init_traj_noise
        # > t_0_latents = t_curr_latents - t_curr * init_traj_noise
        # > init_traj_noise = (t_curr_latents - t_0_latents) / t_curr
        init_traj_noise = (t_curr_latents - self._init_latents) / t_curr

        # Blend init_traj_noise with pred_noise according to the inpaint mask and the trajectory guidance.
        noise = pred_noise * mask + init_traj_noise * (1.0 - mask)

        # Take a denoising step.
        return t_curr_latents + (t_prev - t_curr) * noise
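A quick sanity-check sketch (assuming the class above is importable): with `trajectory_guidance_strength=0.0` and no inpaint mask, the engineered schedule gives `change_ratio == 1.0` at every timestep, so the blended noise equals `pred_noise` and `step()` reduces to the vanilla Euler update:

```python
import torch

from invokeai.backend.flux.trajectory_guidance_extension import TrajectoryGuidanceExtension

ext = TrajectoryGuidanceExtension(
    init_latents=torch.zeros(1, 16, 64),  # arbitrary packed shape for the check
    inpaint_mask=None,
    trajectory_guidance_strength=0.0,
)
x = torch.randn(1, 16, 64)  # stand-in for the current latents
v = torch.randn(1, 16, 64)  # stand-in for the model's noise prediction
t_curr, t_prev = 1.0, 0.95

out = ext.step(t_curr_latents=x, pred_noise=v, t_curr=t_curr, t_prev=t_prev)
assert torch.allclose(out, x + (t_prev - t_curr) * v)  # vanilla image-to-image step
```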
6 changes: 6 additions & 0 deletions invokeai/backend/util/build_line.py
@@ -0,0 +1,6 @@
from typing import Callable


def build_line(x1: float, y1: float, x2: float, y2: float) -> Callable[[float], float]:
"""Build a linear function given two points on the line (x1, y1) and (x2, y2)."""
return lambda x: (y2 - y1) / (x2 - x1) * (x - x1) + y1
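A brief usage note for `build_line`: it returns the unique linear map through the two given points (callers must ensure x1 != x2). For example, the `t_cutoff` mapping in the extension interpolates from (strength=0, t=1.0) to (strength=1, t=0.5):

```python
f = build_line(x1=0.0, y1=1.0, x2=1.0, y2=0.5)
assert f(0.0) == 1.0
assert f(1.0) == 0.5
assert f(0.5) == 0.75  # midpoint
```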
7 changes: 7 additions & 0 deletions invokeai/frontend/web/public/locales/en.json
@@ -1040,6 +1040,7 @@
     "strength": "Strength",
     "symmetry": "Symmetry",
     "tileSize": "Tile Size",
+    "optimizedDenoising": "Optimized Denoising",
     "type": "Type",
     "postProcessing": "Post-Processing (Shift + U)",
     "processImage": "Process Image",
@@ -1539,6 +1540,12 @@
       "paragraphs": [
         "Structure controls how closely the output image will keep to the layout of the original. Low structure allows major changes, while high structure strictly maintains the original composition and layout."
       ]
     },
+    "optimizedDenoising": {
+      "heading": "Optimized Denoising",
+      "paragraphs": [
+        "Enable optimized denoising for enhanced image-to-image transformations with Flux models. This setting improves detail and clarity during generation, but may be turned off to preserve more of your original image."
+      ]
+    }
   },
   "unifiedCanvas": {
@@ -58,7 +58,8 @@ export type Feature =
   | 'upscaleModel'
   | 'scale'
   | 'creativity'
-  | 'structure';
+  | 'structure'
+  | 'optimizedDenoising';
 
 export type PopoverData = PopoverProps & {
   image?: string;
@@ -40,6 +40,7 @@ export type ParamsState = {
   cfgRescaleMultiplier: ParameterCFGRescaleMultiplier;
   guidance: ParameterGuidance;
   img2imgStrength: ParameterStrength;
+  optimizedDenoisingEnabled: boolean;
   iterations: number;
   scheduler: ParameterScheduler;
   seed: ParameterSeed;
@@ -83,6 +84,7 @@
   cfgRescaleMultiplier: 0,
   guidance: 4,
   img2imgStrength: 0.75,
+  optimizedDenoisingEnabled: true,
   iterations: 1,
   scheduler: 'euler',
   seed: 0,
@@ -141,6 +143,9 @@ export const paramsSlice = createSlice({
     setImg2imgStrength: (state, action: PayloadAction<number>) => {
       state.img2imgStrength = action.payload;
     },
+    setOptimizedDenoisingEnabled: (state, action: PayloadAction<boolean>) => {
+      state.optimizedDenoisingEnabled = action.payload;
+    },
     setSeamlessXAxis: (state, action: PayloadAction<boolean>) => {
       state.seamlessXAxis = action.payload;
     },
@@ -273,6 +278,7 @@
   setScheduler,
   setSeed,
   setImg2imgStrength,
+  setOptimizedDenoisingEnabled,
   setSeamlessXAxis,
   setSeamlessYAxis,
   setShouldRandomizeSeed,
@@ -341,6 +347,7 @@ export const selectInfillPatchmatchDownscaleSize = createParamsSelector(
 );
 export const selectInfillColorValue = createParamsSelector((params) => params.infillColorValue);
 export const selectImg2imgStrength = createParamsSelector((params) => params.img2imgStrength);
+export const selectOptimizedDenoisingEnabled = createParamsSelector((params) => params.optimizedDenoisingEnabled);
 export const selectPositivePrompt = createParamsSelector((params) => params.positivePrompt);
 export const selectNegativePrompt = createParamsSelector((params) => params.negativePrompt);
 export const selectPositivePrompt2 = createParamsSelector((params) => params.positivePrompt2);
@@ -37,7 +37,17 @@ export const buildFLUXGraph = async (
 
   const { originalSize, scaledSize } = getSizes(bbox);
 
-  const { model, guidance, seed, steps, fluxVAE, t5EncoderModel, clipEmbedModel, img2imgStrength } = params;
+  const {
+    model,
+    guidance,
+    seed,
+    steps,
+    fluxVAE,
+    t5EncoderModel,
+    clipEmbedModel,
+    img2imgStrength,
+    optimizedDenoisingEnabled,
+  } = params;
 
   assert(model, 'No model found in state');
   assert(t5EncoderModel, 'No T5 Encoder model found in state');
@@ -68,7 +78,8 @@
       guidance,
       num_steps: steps,
       seed,
-      denoising_start: 0, // denoising_start should be 0 when latents are not provided
+      trajectory_guidance_strength: 0,
+      denoising_start: 0,
       denoising_end: 1,
       width: scaledSize.width,
       height: scaledSize.height,
@@ -113,6 +124,8 @@
     clip_embed_model: clipEmbedModel,
   });
 
+  const denoisingStart = 1 - img2imgStrength;
+
   if (generationMode === 'txt2img') {
     canvasOutput = addTextToImage(g, l2i, originalSize, scaledSize);
   } else if (generationMode === 'img2img') {
@@ -125,9 +138,15 @@
       originalSize,
       scaledSize,
       bbox,
-      1 - img2imgStrength,
+      denoisingStart,
       false
     );
+    if (optimizedDenoisingEnabled) {
+      g.updateNode(noise, {
+        denoising_start: 0,
+        trajectory_guidance_strength: img2imgStrength,
+      });
+    }
   } else if (generationMode === 'inpaint') {
     canvasOutput = await addInpaint(
       state,
@@ -139,9 +158,15 @@
       modelLoader,
       originalSize,
       scaledSize,
-      1 - img2imgStrength,
+      denoisingStart,
      false
     );
+    if (optimizedDenoisingEnabled) {
+      g.updateNode(noise, {
+        denoising_start: 0,
+        trajectory_guidance_strength: img2imgStrength,
+      });
+    }
   } else if (generationMode === 'outpaint') {
     canvasOutput = await addOutpaint(
       state,
@@ -153,9 +178,15 @@
       modelLoader,
      originalSize,
       scaledSize,
-      1 - img2imgStrength,
+      denoisingStart,
       false
     );
+    if (optimizedDenoisingEnabled) {
+      g.updateNode(noise, {
+        denoising_start: 0,
+        trajectory_guidance_strength: img2imgStrength,
+      });
+    }
   }
 
   if (state.system.shouldUseNSFWChecker) {
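To summarize the graph-builder change: when Optimized Denoising is enabled, FLUX image-to-image, inpaint, and outpaint run the full schedule from pure noise and express strength through trajectory guidance instead of a late `denoising_start`. A restatement of that mapping as a Python sketch (the helper name is ours, not part of this PR):

```python
def flux_denoise_params(img2img_strength: float, optimized: bool) -> dict:
    """Hypothetical helper mirroring how buildFLUXGraph maps UI params to node fields."""
    if optimized:
        # Optimized path: full schedule from pure noise; strength maps to trajectory guidance.
        return {"denoising_start": 0.0, "trajectory_guidance_strength": img2img_strength}
    # Classic path: skip the first (1 - strength) of the schedule, no trajectory guidance.
    return {"denoising_start": 1.0 - img2img_strength, "trajectory_guidance_strength": 0.0}
```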