From 73d4c4d56ddc323f54342cf9b4a9651b7505b987 Mon Sep 17 00:00:00 2001 From: Ryan Dick Date: Wed, 16 Oct 2024 16:22:35 +0000 Subject: [PATCH 01/30] Naive implementation of CFG for FLUX. --- invokeai/app/invocations/flux_denoise.py | 54 ++++++++++++++++++------ invokeai/backend/flux/denoise.py | 22 ++++++++++ 2 files changed, 63 insertions(+), 13 deletions(-) diff --git a/invokeai/app/invocations/flux_denoise.py b/invokeai/app/invocations/flux_denoise.py index f075ea8c9db..7fba862a455 100644 --- a/invokeai/app/invocations/flux_denoise.py +++ b/invokeai/app/invocations/flux_denoise.py @@ -49,7 +49,7 @@ title="FLUX Denoise", tags=["image", "flux"], category="image", - version="3.1.0", + version="3.2.0", classification=Classification.Prototype, ) class FluxDenoiseInvocation(BaseInvocation, WithMetadata, WithBoard): @@ -82,6 +82,12 @@ class FluxDenoiseInvocation(BaseInvocation, WithMetadata, WithBoard): positive_text_conditioning: FluxConditioningField = InputField( description=FieldDescriptions.positive_cond, input=Input.Connection ) + negative_text_conditioning: FluxConditioningField = InputField( + description=FieldDescriptions.negative_cond, input=Input.Connection + ) + # TODO(ryand): Add support for cfg_scale to be a list of floats: one for each step. + # TODO(ryand): Add cfg_scale range validation. + cfg_scale: float = InputField(default=3.0, description=FieldDescriptions.cfg_scale, title="CFG Scale") width: int = InputField(default=1024, multiple_of=16, description="Width of the generated image.") height: int = InputField(default=1024, multiple_of=16, description="Height of the generated image.") num_steps: int = InputField( @@ -108,6 +114,19 @@ def invoke(self, context: InvocationContext) -> LatentsOutput: name = context.tensors.save(tensor=latents) return LatentsOutput.build(latents_name=name, latents=latents, seed=None) + def _load_text_conditioning( + self, context: InvocationContext, conditioning_name: str, dtype: torch.dtype + ) -> Tuple[torch.Tensor, torch.Tensor]: + # Load the conditioning data. + cond_data = context.conditioning.load(conditioning_name) + assert len(cond_data.conditionings) == 1 + flux_conditioning = cond_data.conditionings[0] + assert isinstance(flux_conditioning, FLUXConditioningInfo) + flux_conditioning = flux_conditioning.to(dtype=dtype) + t5_embeddings = flux_conditioning.t5_embeds + clip_embeddings = flux_conditioning.clip_embeds + return t5_embeddings, clip_embeddings + def _run_diffusion( self, context: InvocationContext, @@ -115,13 +134,12 @@ def _run_diffusion( inference_dtype = torch.bfloat16 # Load the conditioning data. - cond_data = context.conditioning.load(self.positive_text_conditioning.conditioning_name) - assert len(cond_data.conditionings) == 1 - flux_conditioning = cond_data.conditionings[0] - assert isinstance(flux_conditioning, FLUXConditioningInfo) - flux_conditioning = flux_conditioning.to(dtype=inference_dtype) - t5_embeddings = flux_conditioning.t5_embeds - clip_embeddings = flux_conditioning.clip_embeds + pos_t5_embeddings, pos_clip_embeddings = self._load_text_conditioning( + context, self.positive_text_conditioning.conditioning_name, inference_dtype + ) + neg_t5_embeddings, neg_clip_embeddings = self._load_text_conditioning( + context, self.negative_text_conditioning.conditioning_name, inference_dtype + ) # Load the input latents, if provided. 
init_latents = context.tensors.load(self.latents.latents_name) if self.latents else None @@ -182,8 +200,14 @@ def _run_diffusion( b, _c, latent_h, latent_w = x.shape img_ids = generate_img_ids(h=latent_h, w=latent_w, batch_size=b, device=x.device, dtype=x.dtype) - bs, t5_seq_len, _ = t5_embeddings.shape - txt_ids = torch.zeros(bs, t5_seq_len, 3, dtype=inference_dtype, device=TorchDevice.choose_torch_device()) + pos_bs, pos_t5_seq_len, _ = pos_t5_embeddings.shape + pos_txt_ids = torch.zeros( + pos_bs, pos_t5_seq_len, 3, dtype=inference_dtype, device=TorchDevice.choose_torch_device() + ) + neg_bs, neg_t5_seq_len, _ = neg_t5_embeddings.shape + neg_txt_ids = torch.zeros( + neg_bs, neg_t5_seq_len, 3, dtype=inference_dtype, device=TorchDevice.choose_torch_device() + ) # Pack all latent tensors. init_latents = pack(init_latents) if init_latents is not None else None @@ -256,12 +280,16 @@ def _run_diffusion( model=transformer, img=x, img_ids=img_ids, - txt=t5_embeddings, - txt_ids=txt_ids, - vec=clip_embeddings, + txt=pos_t5_embeddings, + txt_ids=pos_txt_ids, + vec=pos_clip_embeddings, + neg_txt=neg_t5_embeddings, + neg_txt_ids=neg_txt_ids, + neg_vec=neg_clip_embeddings, timesteps=timesteps, step_callback=self._build_step_callback(context), guidance=self.guidance, + cfg_scale=self.cfg_scale, inpaint_extension=inpaint_extension, controlnet_extensions=controlnet_extensions, ) diff --git a/invokeai/backend/flux/denoise.py b/invokeai/backend/flux/denoise.py index 14fafb6e1d8..b524d67e7cd 100644 --- a/invokeai/backend/flux/denoise.py +++ b/invokeai/backend/flux/denoise.py @@ -16,13 +16,19 @@ def denoise( # model input img: torch.Tensor, img_ids: torch.Tensor, + # positive text conditioning txt: torch.Tensor, txt_ids: torch.Tensor, vec: torch.Tensor, + # negative text conditioning + neg_txt: torch.Tensor, + neg_txt_ids: torch.Tensor, + neg_vec: torch.Tensor, # sampling parameters timesteps: list[float], step_callback: Callable[[PipelineIntermediateState], None], guidance: float, + cfg_scale: float, inpaint_extension: InpaintExtension | None, controlnet_extensions: list[XLabsControlNetExtension | InstantXControlNetExtension], ): @@ -78,6 +84,22 @@ def denoise( controlnet_single_block_residuals=merged_controlnet_residuals.single_block_residuals, ) + # TODO(ryand): Add option to apply controlnet to negative conditioning as well. + # TODO(ryand): Add option to run positive and negative predictions in a single batch for better performance on + # systems with sufficient VRAM. + neg_pred = model( + img=img, + img_ids=img_ids, + txt=neg_txt, + txt_ids=neg_txt_ids, + y=neg_vec, + timesteps=t_vec, + guidance=guidance_vec, + controlnet_double_block_residuals=None, + controlnet_single_block_residuals=None, + ) + pred = neg_pred + cfg_scale * (pred - neg_pred) + preview_img = img - t_curr * pred img = img + (t_prev - t_curr) * pred From 371742d8f91208984746e3612b8f4d52ed636dda Mon Sep 17 00:00:00 2001 From: Ryan Dick Date: Fri, 18 Oct 2024 20:14:47 +0000 Subject: [PATCH 02/30] Add support for cfg_scale list on FLUX Denoise node. 
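When cfg_scale is given as a list, it is treated as a per-step schedule: one value for each of the len(timesteps) - 1 denoising steps, indexed by step_index in the denoise loop. The per-step combination is the usual classifier-free guidance formula. A minimal sketch of that update (illustrative only; the helper and argument names here are hypothetical, not part of the patch):

    import torch

    def cfg_combine(pos_pred: torch.Tensor, neg_pred: torch.Tensor, scale: float) -> torch.Tensor:
        # A scale of 1.0 reduces to pos_pred, so the negative model call can be skipped for that step.
        return neg_pred + scale * (pos_pred - neg_pred)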
--- invokeai/app/invocations/flux_denoise.py | 3 +- invokeai/backend/flux/denoise.py | 44 +++++++++++++----------- 2 files changed, 24 insertions(+), 23 deletions(-) diff --git a/invokeai/app/invocations/flux_denoise.py b/invokeai/app/invocations/flux_denoise.py index 7fba862a455..e87c2ff3de9 100644 --- a/invokeai/app/invocations/flux_denoise.py +++ b/invokeai/app/invocations/flux_denoise.py @@ -85,9 +85,8 @@ class FluxDenoiseInvocation(BaseInvocation, WithMetadata, WithBoard): negative_text_conditioning: FluxConditioningField = InputField( description=FieldDescriptions.negative_cond, input=Input.Connection ) - # TODO(ryand): Add support for cfg_scale to be a list of floats: one for each step. # TODO(ryand): Add cfg_scale range validation. - cfg_scale: float = InputField(default=3.0, description=FieldDescriptions.cfg_scale, title="CFG Scale") + cfg_scale: float | list[float] = InputField(default=1.0, description=FieldDescriptions.cfg_scale, title="CFG Scale") width: int = InputField(default=1024, multiple_of=16, description="Width of the generated image.") height: int = InputField(default=1024, multiple_of=16, description="Height of the generated image.") num_steps: int = InputField( diff --git a/invokeai/backend/flux/denoise.py b/invokeai/backend/flux/denoise.py index b524d67e7cd..bcdb15a18f9 100644 --- a/invokeai/backend/flux/denoise.py +++ b/invokeai/backend/flux/denoise.py @@ -1,3 +1,4 @@ +import math from typing import Callable import torch @@ -28,7 +29,7 @@ def denoise( timesteps: list[float], step_callback: Callable[[PipelineIntermediateState], None], guidance: float, - cfg_scale: float, + cfg_scale: float | list[float], inpaint_extension: InpaintExtension | None, controlnet_extensions: list[XLabsControlNetExtension | InstantXControlNetExtension], ): @@ -43,10 +44,9 @@ def denoise( latents=img, ), ) - step = 1 # guidance_vec is ignored for schnell. guidance_vec = torch.full((img.shape[0],), guidance, device=img.device, dtype=img.dtype) - for t_curr, t_prev in tqdm(list(zip(timesteps[:-1], timesteps[1:], strict=True))): + for step_index, (t_curr, t_prev) in tqdm(list(enumerate(zip(timesteps[:-1], timesteps[1:], strict=True)))): t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device) # Run ControlNet models. @@ -54,7 +54,7 @@ def denoise( for controlnet_extension in controlnet_extensions: controlnet_residuals.append( controlnet_extension.run_controlnet( - timestep_index=step - 1, + timestep_index=step_index, total_num_timesteps=total_steps, img=img, img_ids=img_ids, @@ -84,21 +84,24 @@ def denoise( controlnet_single_block_residuals=merged_controlnet_residuals.single_block_residuals, ) - # TODO(ryand): Add option to apply controlnet to negative conditioning as well. - # TODO(ryand): Add option to run positive and negative predictions in a single batch for better performance on - # systems with sufficient VRAM. - neg_pred = model( - img=img, - img_ids=img_ids, - txt=neg_txt, - txt_ids=neg_txt_ids, - y=neg_vec, - timesteps=t_vec, - guidance=guidance_vec, - controlnet_double_block_residuals=None, - controlnet_single_block_residuals=None, - ) - pred = neg_pred + cfg_scale * (pred - neg_pred) + step_cfg_scale = cfg_scale[step_index] if isinstance(cfg_scale, list) else cfg_scale + + # If step_cfg_scale, is 1.0, then we don't need to run the negative prediction. + if not math.isclose(step_cfg_scale, 1.0): + # TODO(ryand): Add option to run positive and negative predictions in a single batch for better performance on + # systems with sufficient VRAM. 
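+            # The negative prediction computed here is combined with the positive prediction using the standard CFG formula.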
+ neg_pred = model( + img=img, + img_ids=img_ids, + txt=neg_txt, + txt_ids=neg_txt_ids, + y=neg_vec, + timesteps=t_vec, + guidance=guidance_vec, + controlnet_double_block_residuals=None, + controlnet_single_block_residuals=None, + ) + pred = neg_pred + step_cfg_scale * (pred - neg_pred) preview_img = img - t_curr * pred img = img + (t_prev - t_curr) * pred @@ -109,13 +112,12 @@ def denoise( step_callback( PipelineIntermediateState( - step=step, + step=step_index + 1, order=1, total_steps=total_steps, timestep=int(t_curr), latents=preview_img, ), ) - step += 1 return img From 6df4ee5fc8bcb2317361e9b01de4d1d05186e6af Mon Sep 17 00:00:00 2001 From: Ryan Dick Date: Fri, 18 Oct 2024 20:31:27 +0000 Subject: [PATCH 03/30] Make negative_text_conditioning nullable on FLUX Denoise invocation. --- invokeai/app/invocations/flux_denoise.py | 25 +++++++++++++++--------- invokeai/backend/flux/denoise.py | 10 +++++++--- 2 files changed, 23 insertions(+), 12 deletions(-) diff --git a/invokeai/app/invocations/flux_denoise.py b/invokeai/app/invocations/flux_denoise.py index e87c2ff3de9..81e2f28a4fe 100644 --- a/invokeai/app/invocations/flux_denoise.py +++ b/invokeai/app/invocations/flux_denoise.py @@ -82,8 +82,10 @@ class FluxDenoiseInvocation(BaseInvocation, WithMetadata, WithBoard): positive_text_conditioning: FluxConditioningField = InputField( description=FieldDescriptions.positive_cond, input=Input.Connection ) - negative_text_conditioning: FluxConditioningField = InputField( - description=FieldDescriptions.negative_cond, input=Input.Connection + negative_text_conditioning: FluxConditioningField | None = InputField( + default=None, + description="Negative conditioning tensor. Can be None if cfg_scale is 1.0.", + input=Input.Connection, ) # TODO(ryand): Add cfg_scale range validation. cfg_scale: float | list[float] = InputField(default=1.0, description=FieldDescriptions.cfg_scale, title="CFG Scale") @@ -136,9 +138,12 @@ def _run_diffusion( pos_t5_embeddings, pos_clip_embeddings = self._load_text_conditioning( context, self.positive_text_conditioning.conditioning_name, inference_dtype ) - neg_t5_embeddings, neg_clip_embeddings = self._load_text_conditioning( - context, self.negative_text_conditioning.conditioning_name, inference_dtype - ) + neg_t5_embeddings: torch.Tensor | None = None + neg_clip_embeddings: torch.Tensor | None = None + if self.negative_text_conditioning is not None: + neg_t5_embeddings, neg_clip_embeddings = self._load_text_conditioning( + context, self.negative_text_conditioning.conditioning_name, inference_dtype + ) # Load the input latents, if provided. init_latents = context.tensors.load(self.latents.latents_name) if self.latents else None @@ -203,10 +208,12 @@ def _run_diffusion( pos_txt_ids = torch.zeros( pos_bs, pos_t5_seq_len, 3, dtype=inference_dtype, device=TorchDevice.choose_torch_device() ) - neg_bs, neg_t5_seq_len, _ = neg_t5_embeddings.shape - neg_txt_ids = torch.zeros( - neg_bs, neg_t5_seq_len, 3, dtype=inference_dtype, device=TorchDevice.choose_torch_device() - ) + neg_txt_ids: torch.Tensor | None = None + if neg_t5_embeddings is not None: + neg_bs, neg_t5_seq_len, _ = neg_t5_embeddings.shape + neg_txt_ids = torch.zeros( + neg_bs, neg_t5_seq_len, 3, dtype=inference_dtype, device=TorchDevice.choose_torch_device() + ) # Pack all latent tensors. 
init_latents = pack(init_latents) if init_latents is not None else None diff --git a/invokeai/backend/flux/denoise.py b/invokeai/backend/flux/denoise.py index bcdb15a18f9..92811f76f64 100644 --- a/invokeai/backend/flux/denoise.py +++ b/invokeai/backend/flux/denoise.py @@ -22,9 +22,9 @@ def denoise( txt_ids: torch.Tensor, vec: torch.Tensor, # negative text conditioning - neg_txt: torch.Tensor, - neg_txt_ids: torch.Tensor, - neg_vec: torch.Tensor, + neg_txt: torch.Tensor | None, + neg_txt_ids: torch.Tensor | None, + neg_vec: torch.Tensor | None, # sampling parameters timesteps: list[float], step_callback: Callable[[PipelineIntermediateState], None], @@ -90,6 +90,10 @@ def denoise( if not math.isclose(step_cfg_scale, 1.0): # TODO(ryand): Add option to run positive and negative predictions in a single batch for better performance on # systems with sufficient VRAM. + + if neg_txt is None or neg_txt_ids is None or neg_vec is None: + raise ValueError("Negative text conditioning is required when cfg_scale is not 1.0.") + neg_pred = model( img=img, img_ids=img_ids, From 32c7cdd856520aee9f22195406e45289377d2d7d Mon Sep 17 00:00:00 2001 From: Ryan Dick Date: Mon, 21 Oct 2024 14:52:02 +0000 Subject: [PATCH 04/30] Add cfg_scale_start_step and cfg_scale_end_step to FLUX Denoise node. --- invokeai/app/invocations/flux_denoise.py | 71 +++++++++++++++++++++- invokeai/backend/flux/denoise.py | 4 +- tests/app/invocations/test_flux_denoise.py | 62 +++++++++++++++++++ 3 files changed, 133 insertions(+), 4 deletions(-) create mode 100644 tests/app/invocations/test_flux_denoise.py diff --git a/invokeai/app/invocations/flux_denoise.py b/invokeai/app/invocations/flux_denoise.py index 81e2f28a4fe..8120ac400f5 100644 --- a/invokeai/app/invocations/flux_denoise.py +++ b/invokeai/app/invocations/flux_denoise.py @@ -87,8 +87,19 @@ class FluxDenoiseInvocation(BaseInvocation, WithMetadata, WithBoard): description="Negative conditioning tensor. Can be None if cfg_scale is 1.0.", input=Input.Connection, ) - # TODO(ryand): Add cfg_scale range validation. cfg_scale: float | list[float] = InputField(default=1.0, description=FieldDescriptions.cfg_scale, title="CFG Scale") + cfg_scale_start_step: int = InputField( + default=0, + title="CFG Scale Start Step", + description="Index of the first step to apply cfg_scale. Negative indices count backwards from the " + + "the last step (e.g. a value of -1 refers to the final step).", + ) + cfg_scale_end_step: int = InputField( + default=-1, + title="CFG Scale End Step", + description="Index of the last step to apply cfg_scale. Negative indices count backwards from the " + + "last step (e.g. a value of -1 refers to the final step).", + ) width: int = InputField(default=1024, multiple_of=16, description="Width of the generated image.") height: int = InputField(default=1024, multiple_of=16, description="Height of the generated image.") num_steps: int = InputField( @@ -234,6 +245,13 @@ def _run_diffusion( noise=noise, ) + cfg_scale = self.prep_cfg_scale( + cfg_scale=self.cfg_scale, + timesteps=timesteps, + cfg_scale_start_step=self.cfg_scale_start_step, + cfg_scale_end_step=self.cfg_scale_end_step, + ) + with ExitStack() as exit_stack: # Prepare ControlNet extensions. # Note: We do this before loading the transformer model to minimize peak memory (see implementation). 
@@ -295,7 +313,7 @@ def _run_diffusion( timesteps=timesteps, step_callback=self._build_step_callback(context), guidance=self.guidance, - cfg_scale=self.cfg_scale, + cfg_scale=cfg_scale, inpaint_extension=inpaint_extension, controlnet_extensions=controlnet_extensions, ) @@ -303,6 +321,55 @@ def _run_diffusion( x = unpack(x.float(), self.height, self.width) return x + @classmethod + def prep_cfg_scale( + cls, cfg_scale: float | list[float], timesteps: list[float], cfg_scale_start_step: int, cfg_scale_end_step: int + ) -> list[float]: + """Prepare the cfg_scale schedule. + + - Clips the cfg_scale schedule based on cfg_scale_start_step and cfg_scale_end_step. + - If cfg_scale is a list, then it is assumed to be a schedule and is returned as-is. + - If cfg_scale is a scalar, then a linear schedule is created from cfg_scale_start_step to cfg_scale_end_step. + """ + # num_steps is the number of denoising steps, which is one less than the number of timesteps. + num_steps = len(timesteps) - 1 + + # Normalize cfg_scale to a list if it is a scalar. + cfg_scale_list: list[float] + if isinstance(cfg_scale, float): + cfg_scale_list = [cfg_scale] * num_steps + elif isinstance(cfg_scale, list): + cfg_scale_list = cfg_scale + else: + raise ValueError(f"Unsupported cfg_scale type: {type(cfg_scale)}") + assert len(cfg_scale_list) == num_steps + + # Handle negative indices for cfg_scale_start_step and cfg_scale_end_step. + start_step_index = cfg_scale_start_step + if start_step_index < 0: + start_step_index = num_steps + start_step_index + end_step_index = cfg_scale_end_step + if end_step_index < 0: + end_step_index = num_steps + end_step_index + + # Validate the start and end step indices. + if not (0 <= start_step_index < num_steps): + raise ValueError(f"Invalid cfg_scale_start_step. Out of range: {cfg_scale_start_step}.") + if not (0 <= end_step_index < num_steps): + raise ValueError(f"Invalid cfg_scale_end_step. Out of range: {cfg_scale_end_step}.") + if start_step_index > end_step_index: + raise ValueError( + f"cfg_scale_start_step ({cfg_scale_start_step}) must be before cfg_scale_end_step " + + f"({cfg_scale_end_step})." + ) + + # Set values outside the start and end step indices to 1.0. This is equivalent to disabling cfg_scale for those + # steps. + clipped_cfg_scale = [1.0] * num_steps + clipped_cfg_scale[start_step_index : end_step_index + 1] = cfg_scale_list[start_step_index : end_step_index + 1] + + return clipped_cfg_scale + def _prep_inpaint_mask(self, context: InvocationContext, latents: torch.Tensor) -> torch.Tensor | None: """Prepare the inpaint mask. diff --git a/invokeai/backend/flux/denoise.py b/invokeai/backend/flux/denoise.py index 92811f76f64..7ce375f4a24 100644 --- a/invokeai/backend/flux/denoise.py +++ b/invokeai/backend/flux/denoise.py @@ -29,7 +29,7 @@ def denoise( timesteps: list[float], step_callback: Callable[[PipelineIntermediateState], None], guidance: float, - cfg_scale: float | list[float], + cfg_scale: list[float], inpaint_extension: InpaintExtension | None, controlnet_extensions: list[XLabsControlNetExtension | InstantXControlNetExtension], ): @@ -84,7 +84,7 @@ def denoise( controlnet_single_block_residuals=merged_controlnet_residuals.single_block_residuals, ) - step_cfg_scale = cfg_scale[step_index] if isinstance(cfg_scale, list) else cfg_scale + step_cfg_scale = cfg_scale[step_index] # If step_cfg_scale, is 1.0, then we don't need to run the negative prediction. 
if not math.isclose(step_cfg_scale, 1.0): diff --git a/tests/app/invocations/test_flux_denoise.py b/tests/app/invocations/test_flux_denoise.py new file mode 100644 index 00000000000..412ef7a490a --- /dev/null +++ b/tests/app/invocations/test_flux_denoise.py @@ -0,0 +1,62 @@ +import pytest + +from invokeai.app.invocations.flux_denoise import FluxDenoiseInvocation + +TIMESTEPS = [1.0, 0.75, 0.5, 0.25, 0.0] + + +@pytest.mark.parametrize( + ["cfg_scale", "timesteps", "cfg_scale_start_step", "cfg_scale_end_step", "expected"], + [ + # Test scalar cfg_scale. + (2.0, TIMESTEPS, 0, -1, [2.0, 2.0, 2.0, 2.0]), + # Test list cfg_scale. + ([1.0, 2.0, 3.0, 4.0], TIMESTEPS, 0, -1, [1.0, 2.0, 3.0, 4.0]), + # Test positive cfg_scale_start_step. + (2.0, TIMESTEPS, 1, -1, [1.0, 2.0, 2.0, 2.0]), + # Test positive cfg_scale_end_step. + (2.0, TIMESTEPS, 0, 2, [2.0, 2.0, 2.0, 1.0]), + # Test negative cfg_scale_start_step. + (2.0, TIMESTEPS, -3, -1, [1.0, 2.0, 2.0, 2.0]), + # Test negative cfg_scale_end_step. + (2.0, TIMESTEPS, 0, -2, [2.0, 2.0, 2.0, 1.0]), + # Test single step application. + (2.0, TIMESTEPS, 2, 2, [1.0, 1.0, 2.0, 1.0]), + ], +) +def test_prep_cfg_scale( + cfg_scale: float | list[float], + timesteps: list[float], + cfg_scale_start_step: int, + cfg_scale_end_step: int, + expected: list[float], +): + result = FluxDenoiseInvocation.prep_cfg_scale(cfg_scale, timesteps, cfg_scale_start_step, cfg_scale_end_step) + assert result == expected + + +def test_prep_cfg_scale_invalid_type(): + with pytest.raises(ValueError, match="Unsupported cfg_scale type"): + FluxDenoiseInvocation.prep_cfg_scale("invalid", [1.0, 0.5], 0, -1) # type: ignore + + +@pytest.mark.parametrize("cfg_scale_start_step", [4, -5]) +def test_prep_cfg_scale_invalid_start_step(cfg_scale_start_step: int): + with pytest.raises(ValueError, match="Invalid cfg_scale_start_step"): + FluxDenoiseInvocation.prep_cfg_scale(2.0, TIMESTEPS, cfg_scale_start_step, -1) + + +@pytest.mark.parametrize("cfg_scale_end_step", [4, -5]) +def test_prep_cfg_scale_invalid_end_step(cfg_scale_end_step: int): + with pytest.raises(ValueError, match="Invalid cfg_scale_end_step"): + FluxDenoiseInvocation.prep_cfg_scale(2.0, TIMESTEPS, 0, cfg_scale_end_step) + + +def test_prep_cfg_scale_start_after_end(): + with pytest.raises(ValueError, match="cfg_scale_start_step .* must be before cfg_scale_end_step"): + FluxDenoiseInvocation.prep_cfg_scale(2.0, TIMESTEPS, 3, 2) + + +def test_prep_cfg_scale_list_length_mismatch(): + with pytest.raises(AssertionError): + FluxDenoiseInvocation.prep_cfg_scale([1.0, 2.0, 3.0], TIMESTEPS, 0, -1) From 7bf5927c43342c4413ba39f3e1df220fe7e2cb28 Mon Sep 17 00:00:00 2001 From: Ryan Dick Date: Fri, 11 Oct 2024 13:12:04 +0000 Subject: [PATCH 05/30] Add XLabs IP-Adapter state dict for unit tests. --- .../xlabs_flux_ip_adapter_state_dict.py | 85 +++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 tests/backend/flux/ip_adapter/xlabs_flux_ip_adapter_state_dict.py diff --git a/tests/backend/flux/ip_adapter/xlabs_flux_ip_adapter_state_dict.py b/tests/backend/flux/ip_adapter/xlabs_flux_ip_adapter_state_dict.py new file mode 100644 index 00000000000..9d1453aa512 --- /dev/null +++ b/tests/backend/flux/ip_adapter/xlabs_flux_ip_adapter_state_dict.py @@ -0,0 +1,85 @@ +# State dict keys and shapes for an XLabs FLUX IP-Adapter model. Intended to be used for unit tests. 
+# These keys were extracted from: +# https://huggingface.co/XLabs-AI/flux-ip-adapter/blob/ad16be50d78a07ea83d8c4bde44ff9753235182e/flux-ip-adapter.safetensors +xlabs_sd_shapes = { + "double_blocks.0.processor.ip_adapter_double_stream_k_proj.bias": [3072], + "double_blocks.0.processor.ip_adapter_double_stream_k_proj.weight": [3072, 4096], + "double_blocks.0.processor.ip_adapter_double_stream_v_proj.bias": [3072], + "double_blocks.0.processor.ip_adapter_double_stream_v_proj.weight": [3072, 4096], + "double_blocks.1.processor.ip_adapter_double_stream_k_proj.bias": [3072], + "double_blocks.1.processor.ip_adapter_double_stream_k_proj.weight": [3072, 4096], + "double_blocks.1.processor.ip_adapter_double_stream_v_proj.bias": [3072], + "double_blocks.1.processor.ip_adapter_double_stream_v_proj.weight": [3072, 4096], + "double_blocks.10.processor.ip_adapter_double_stream_k_proj.bias": [3072], + "double_blocks.10.processor.ip_adapter_double_stream_k_proj.weight": [3072, 4096], + "double_blocks.10.processor.ip_adapter_double_stream_v_proj.bias": [3072], + "double_blocks.10.processor.ip_adapter_double_stream_v_proj.weight": [3072, 4096], + "double_blocks.11.processor.ip_adapter_double_stream_k_proj.bias": [3072], + "double_blocks.11.processor.ip_adapter_double_stream_k_proj.weight": [3072, 4096], + "double_blocks.11.processor.ip_adapter_double_stream_v_proj.bias": [3072], + "double_blocks.11.processor.ip_adapter_double_stream_v_proj.weight": [3072, 4096], + "double_blocks.12.processor.ip_adapter_double_stream_k_proj.bias": [3072], + "double_blocks.12.processor.ip_adapter_double_stream_k_proj.weight": [3072, 4096], + "double_blocks.12.processor.ip_adapter_double_stream_v_proj.bias": [3072], + "double_blocks.12.processor.ip_adapter_double_stream_v_proj.weight": [3072, 4096], + "double_blocks.13.processor.ip_adapter_double_stream_k_proj.bias": [3072], + "double_blocks.13.processor.ip_adapter_double_stream_k_proj.weight": [3072, 4096], + "double_blocks.13.processor.ip_adapter_double_stream_v_proj.bias": [3072], + "double_blocks.13.processor.ip_adapter_double_stream_v_proj.weight": [3072, 4096], + "double_blocks.14.processor.ip_adapter_double_stream_k_proj.bias": [3072], + "double_blocks.14.processor.ip_adapter_double_stream_k_proj.weight": [3072, 4096], + "double_blocks.14.processor.ip_adapter_double_stream_v_proj.bias": [3072], + "double_blocks.14.processor.ip_adapter_double_stream_v_proj.weight": [3072, 4096], + "double_blocks.15.processor.ip_adapter_double_stream_k_proj.bias": [3072], + "double_blocks.15.processor.ip_adapter_double_stream_k_proj.weight": [3072, 4096], + "double_blocks.15.processor.ip_adapter_double_stream_v_proj.bias": [3072], + "double_blocks.15.processor.ip_adapter_double_stream_v_proj.weight": [3072, 4096], + "double_blocks.16.processor.ip_adapter_double_stream_k_proj.bias": [3072], + "double_blocks.16.processor.ip_adapter_double_stream_k_proj.weight": [3072, 4096], + "double_blocks.16.processor.ip_adapter_double_stream_v_proj.bias": [3072], + "double_blocks.16.processor.ip_adapter_double_stream_v_proj.weight": [3072, 4096], + "double_blocks.17.processor.ip_adapter_double_stream_k_proj.bias": [3072], + "double_blocks.17.processor.ip_adapter_double_stream_k_proj.weight": [3072, 4096], + "double_blocks.17.processor.ip_adapter_double_stream_v_proj.bias": [3072], + "double_blocks.17.processor.ip_adapter_double_stream_v_proj.weight": [3072, 4096], + "double_blocks.18.processor.ip_adapter_double_stream_k_proj.bias": [3072], + 
"double_blocks.18.processor.ip_adapter_double_stream_k_proj.weight": [3072, 4096], + "double_blocks.18.processor.ip_adapter_double_stream_v_proj.bias": [3072], + "double_blocks.18.processor.ip_adapter_double_stream_v_proj.weight": [3072, 4096], + "double_blocks.2.processor.ip_adapter_double_stream_k_proj.bias": [3072], + "double_blocks.2.processor.ip_adapter_double_stream_k_proj.weight": [3072, 4096], + "double_blocks.2.processor.ip_adapter_double_stream_v_proj.bias": [3072], + "double_blocks.2.processor.ip_adapter_double_stream_v_proj.weight": [3072, 4096], + "double_blocks.3.processor.ip_adapter_double_stream_k_proj.bias": [3072], + "double_blocks.3.processor.ip_adapter_double_stream_k_proj.weight": [3072, 4096], + "double_blocks.3.processor.ip_adapter_double_stream_v_proj.bias": [3072], + "double_blocks.3.processor.ip_adapter_double_stream_v_proj.weight": [3072, 4096], + "double_blocks.4.processor.ip_adapter_double_stream_k_proj.bias": [3072], + "double_blocks.4.processor.ip_adapter_double_stream_k_proj.weight": [3072, 4096], + "double_blocks.4.processor.ip_adapter_double_stream_v_proj.bias": [3072], + "double_blocks.4.processor.ip_adapter_double_stream_v_proj.weight": [3072, 4096], + "double_blocks.5.processor.ip_adapter_double_stream_k_proj.bias": [3072], + "double_blocks.5.processor.ip_adapter_double_stream_k_proj.weight": [3072, 4096], + "double_blocks.5.processor.ip_adapter_double_stream_v_proj.bias": [3072], + "double_blocks.5.processor.ip_adapter_double_stream_v_proj.weight": [3072, 4096], + "double_blocks.6.processor.ip_adapter_double_stream_k_proj.bias": [3072], + "double_blocks.6.processor.ip_adapter_double_stream_k_proj.weight": [3072, 4096], + "double_blocks.6.processor.ip_adapter_double_stream_v_proj.bias": [3072], + "double_blocks.6.processor.ip_adapter_double_stream_v_proj.weight": [3072, 4096], + "double_blocks.7.processor.ip_adapter_double_stream_k_proj.bias": [3072], + "double_blocks.7.processor.ip_adapter_double_stream_k_proj.weight": [3072, 4096], + "double_blocks.7.processor.ip_adapter_double_stream_v_proj.bias": [3072], + "double_blocks.7.processor.ip_adapter_double_stream_v_proj.weight": [3072, 4096], + "double_blocks.8.processor.ip_adapter_double_stream_k_proj.bias": [3072], + "double_blocks.8.processor.ip_adapter_double_stream_k_proj.weight": [3072, 4096], + "double_blocks.8.processor.ip_adapter_double_stream_v_proj.bias": [3072], + "double_blocks.8.processor.ip_adapter_double_stream_v_proj.weight": [3072, 4096], + "double_blocks.9.processor.ip_adapter_double_stream_k_proj.bias": [3072], + "double_blocks.9.processor.ip_adapter_double_stream_k_proj.weight": [3072, 4096], + "double_blocks.9.processor.ip_adapter_double_stream_v_proj.bias": [3072], + "double_blocks.9.processor.ip_adapter_double_stream_v_proj.weight": [3072, 4096], + "ip_adapter_proj_model.norm.bias": [4096], + "ip_adapter_proj_model.norm.weight": [4096], + "ip_adapter_proj_model.proj.bias": [16384], + "ip_adapter_proj_model.proj.weight": [16384, 768], +} From 9c9af312fe87ac8ccfa96689ebc5c0adce13f0ec Mon Sep 17 00:00:00 2001 From: Ryan Dick Date: Fri, 11 Oct 2024 14:11:11 +0000 Subject: [PATCH 06/30] Copy IPDoubleStreamBlockProcessor from https://github.com/XLabs-AI/x-flux/blob/47495425dbed499be1e8e5a6e52628b07349cba2/src/flux/modules/layers.py#L221. 
--- invokeai/backend/flux/ip_adapter/__init__.py | 0 .../ip_double_stream_block_processor.py | 75 +++++++++++++++++++ 2 files changed, 75 insertions(+) create mode 100644 invokeai/backend/flux/ip_adapter/__init__.py create mode 100644 invokeai/backend/flux/ip_adapter/ip_double_stream_block_processor.py diff --git a/invokeai/backend/flux/ip_adapter/__init__.py b/invokeai/backend/flux/ip_adapter/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/invokeai/backend/flux/ip_adapter/ip_double_stream_block_processor.py b/invokeai/backend/flux/ip_adapter/ip_double_stream_block_processor.py new file mode 100644 index 00000000000..0b75b2a52fb --- /dev/null +++ b/invokeai/backend/flux/ip_adapter/ip_double_stream_block_processor.py @@ -0,0 +1,75 @@ +# This file is based on: +# https://github.com/XLabs-AI/x-flux/blob/47495425dbed499be1e8e5a6e52628b07349cba2/src/flux/modules/layers.py#L221 + + +class IPDoubleStreamBlockProcessor(nn.Module): + """Attention processor for handling IP-adapter with double stream block.""" + + def __init__(self, context_dim, hidden_dim): + super().__init__() + if not hasattr(F, "scaled_dot_product_attention"): + raise ImportError("IPDoubleStreamBlockProcessor requires PyTorch 2.0 or higher. Please upgrade PyTorch.") + + # Ensure context_dim matches the dimension of image_proj + self.context_dim = context_dim + self.hidden_dim = hidden_dim + + # Initialize projections for IP-adapter + self.ip_adapter_double_stream_k_proj = nn.Linear(context_dim, hidden_dim, bias=True) + self.ip_adapter_double_stream_v_proj = nn.Linear(context_dim, hidden_dim, bias=True) + + nn.init.zeros_(self.ip_adapter_double_stream_k_proj.weight) + nn.init.zeros_(self.ip_adapter_double_stream_k_proj.bias) + + nn.init.zeros_(self.ip_adapter_double_stream_v_proj.weight) + nn.init.zeros_(self.ip_adapter_double_stream_v_proj.bias) + + def __call__(self, attn, img, txt, vec, pe, image_proj, ip_scale=1.0, **attention_kwargs): + # Prepare image for attention + img_mod1, img_mod2 = attn.img_mod(vec) + txt_mod1, txt_mod2 = attn.txt_mod(vec) + + img_modulated = attn.img_norm1(img) + img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift + img_qkv = attn.img_attn.qkv(img_modulated) + img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=attn.num_heads, D=attn.head_dim) + img_q, img_k = attn.img_attn.norm(img_q, img_k, img_v) + + txt_modulated = attn.txt_norm1(txt) + txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift + txt_qkv = attn.txt_attn.qkv(txt_modulated) + txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=attn.num_heads, D=attn.head_dim) + txt_q, txt_k = attn.txt_attn.norm(txt_q, txt_k, txt_v) + + q = torch.cat((txt_q, img_q), dim=2) + k = torch.cat((txt_k, img_k), dim=2) + v = torch.cat((txt_v, img_v), dim=2) + + attn1 = attention(q, k, v, pe=pe) + txt_attn, img_attn = attn1[:, : txt.shape[1]], attn1[:, txt.shape[1] :] + + # print(f"txt_attn shape: {txt_attn.size()}") + # print(f"img_attn shape: {img_attn.size()}") + + img = img + img_mod1.gate * attn.img_attn.proj(img_attn) + img = img + img_mod2.gate * attn.img_mlp((1 + img_mod2.scale) * attn.img_norm2(img) + img_mod2.shift) + + txt = txt + txt_mod1.gate * attn.txt_attn.proj(txt_attn) + txt = txt + txt_mod2.gate * attn.txt_mlp((1 + txt_mod2.scale) * attn.txt_norm2(txt) + txt_mod2.shift) + + # IP-adapter processing + ip_query = img_q # latent sample query + ip_key = self.ip_adapter_double_stream_k_proj(image_proj) + ip_value = 
self.ip_adapter_double_stream_v_proj(image_proj) + + # Reshape projections for multi-head attention + ip_key = rearrange(ip_key, "B L (H D) -> B H L D", H=attn.num_heads, D=attn.head_dim) + ip_value = rearrange(ip_value, "B L (H D) -> B H L D", H=attn.num_heads, D=attn.head_dim) + + # Compute attention between IP projections and the latent query + ip_attention = F.scaled_dot_product_attention(ip_query, ip_key, ip_value, dropout_p=0.0, is_causal=False) + ip_attention = rearrange(ip_attention, "B H L D -> B L (H D)", H=attn.num_heads, D=attn.head_dim) + + img = img + ip_scale * ip_attention + + return img, txt From ac7441e606a2d4bfda9b0924bd268ac56113e26f Mon Sep 17 00:00:00 2001 From: Ryan Dick Date: Fri, 11 Oct 2024 14:19:37 +0000 Subject: [PATCH 07/30] Fixup typing/imports for IPDoubleStreamBlockProcessor. --- .../ip_double_stream_block_processor.py | 58 ++++++++++++------- 1 file changed, 38 insertions(+), 20 deletions(-) diff --git a/invokeai/backend/flux/ip_adapter/ip_double_stream_block_processor.py b/invokeai/backend/flux/ip_adapter/ip_double_stream_block_processor.py index 0b75b2a52fb..9b1bef7f707 100644 --- a/invokeai/backend/flux/ip_adapter/ip_double_stream_block_processor.py +++ b/invokeai/backend/flux/ip_adapter/ip_double_stream_block_processor.py @@ -1,30 +1,42 @@ # This file is based on: # https://github.com/XLabs-AI/x-flux/blob/47495425dbed499be1e8e5a6e52628b07349cba2/src/flux/modules/layers.py#L221 +import einops +import torch +from invokeai.backend.flux.math import attention +from invokeai.backend.flux.modules.layers import DoubleStreamBlock -class IPDoubleStreamBlockProcessor(nn.Module): + +class IPDoubleStreamBlockProcessor(torch.nn.Module): """Attention processor for handling IP-adapter with double stream block.""" - def __init__(self, context_dim, hidden_dim): + def __init__(self, context_dim: int, hidden_dim: int): super().__init__() - if not hasattr(F, "scaled_dot_product_attention"): - raise ImportError("IPDoubleStreamBlockProcessor requires PyTorch 2.0 or higher. 
Please upgrade PyTorch.") # Ensure context_dim matches the dimension of image_proj self.context_dim = context_dim self.hidden_dim = hidden_dim # Initialize projections for IP-adapter - self.ip_adapter_double_stream_k_proj = nn.Linear(context_dim, hidden_dim, bias=True) - self.ip_adapter_double_stream_v_proj = nn.Linear(context_dim, hidden_dim, bias=True) - - nn.init.zeros_(self.ip_adapter_double_stream_k_proj.weight) - nn.init.zeros_(self.ip_adapter_double_stream_k_proj.bias) - - nn.init.zeros_(self.ip_adapter_double_stream_v_proj.weight) - nn.init.zeros_(self.ip_adapter_double_stream_v_proj.bias) - - def __call__(self, attn, img, txt, vec, pe, image_proj, ip_scale=1.0, **attention_kwargs): + self.ip_adapter_double_stream_k_proj = torch.nn.Linear(context_dim, hidden_dim, bias=True) + self.ip_adapter_double_stream_v_proj = torch.nn.Linear(context_dim, hidden_dim, bias=True) + + torch.nn.init.zeros_(self.ip_adapter_double_stream_k_proj.weight) + torch.nn.init.zeros_(self.ip_adapter_double_stream_k_proj.bias) + + torch.nn.init.zeros_(self.ip_adapter_double_stream_v_proj.weight) + torch.nn.init.zeros_(self.ip_adapter_double_stream_v_proj.bias) + + def __call__( + self, + attn: DoubleStreamBlock, + img: torch.Tensor, + txt: torch.Tensor, + vec: torch.Tensor, + pe: torch.Tensor, + image_proj: torch.Tensor, + ip_scale: float = 1.0, + ): # Prepare image for attention img_mod1, img_mod2 = attn.img_mod(vec) txt_mod1, txt_mod2 = attn.txt_mod(vec) @@ -32,13 +44,17 @@ def __call__(self, attn, img, txt, vec, pe, image_proj, ip_scale=1.0, **attentio img_modulated = attn.img_norm1(img) img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift img_qkv = attn.img_attn.qkv(img_modulated) - img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=attn.num_heads, D=attn.head_dim) + img_q, img_k, img_v = einops.rearrange( + img_qkv, "B L (K H D) -> K B H L D", K=3, H=attn.num_heads, D=attn.head_dim + ) img_q, img_k = attn.img_attn.norm(img_q, img_k, img_v) txt_modulated = attn.txt_norm1(txt) txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift txt_qkv = attn.txt_attn.qkv(txt_modulated) - txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=attn.num_heads, D=attn.head_dim) + txt_q, txt_k, txt_v = einops.rearrange( + txt_qkv, "B L (K H D) -> K B H L D", K=3, H=attn.num_heads, D=attn.head_dim + ) txt_q, txt_k = attn.txt_attn.norm(txt_q, txt_k, txt_v) q = torch.cat((txt_q, img_q), dim=2) @@ -63,12 +79,14 @@ def __call__(self, attn, img, txt, vec, pe, image_proj, ip_scale=1.0, **attentio ip_value = self.ip_adapter_double_stream_v_proj(image_proj) # Reshape projections for multi-head attention - ip_key = rearrange(ip_key, "B L (H D) -> B H L D", H=attn.num_heads, D=attn.head_dim) - ip_value = rearrange(ip_value, "B L (H D) -> B H L D", H=attn.num_heads, D=attn.head_dim) + ip_key = einops.rearrange(ip_key, "B L (H D) -> B H L D", H=attn.num_heads, D=attn.head_dim) + ip_value = einops.rearrange(ip_value, "B L (H D) -> B H L D", H=attn.num_heads, D=attn.head_dim) # Compute attention between IP projections and the latent query - ip_attention = F.scaled_dot_product_attention(ip_query, ip_key, ip_value, dropout_p=0.0, is_causal=False) - ip_attention = rearrange(ip_attention, "B H L D -> B L (H D)", H=attn.num_heads, D=attn.head_dim) + ip_attention = torch.nn.functional.scaled_dot_product_attention( + ip_query, ip_key, ip_value, dropout_p=0.0, is_causal=False + ) + ip_attention = einops.rearrange(ip_attention, "B H L D -> B L (H D)", H=attn.num_heads, 
D=attn.head_dim) img = img + ip_scale * ip_attention From 95c30f6a8be18777d4f0d48ecfe1928cfe2bc552 Mon Sep 17 00:00:00 2001 From: Ryan Dick Date: Fri, 11 Oct 2024 17:15:10 +0000 Subject: [PATCH 08/30] Add initial logic for inferring FLUX IP-Adapter params from a state_dict. --- .../flux/ip_adapter/xlabs_ip_adapter_flux.py | 60 +++++++++++++++++++ .../ip_adapter/test_xlabs_ip_adapter_flux.py | 17 ++++++ 2 files changed, 77 insertions(+) create mode 100644 invokeai/backend/flux/ip_adapter/xlabs_ip_adapter_flux.py create mode 100644 tests/backend/flux/ip_adapter/test_xlabs_ip_adapter_flux.py diff --git a/invokeai/backend/flux/ip_adapter/xlabs_ip_adapter_flux.py b/invokeai/backend/flux/ip_adapter/xlabs_ip_adapter_flux.py new file mode 100644 index 00000000000..63fd1212215 --- /dev/null +++ b/invokeai/backend/flux/ip_adapter/xlabs_ip_adapter_flux.py @@ -0,0 +1,60 @@ +from dataclasses import dataclass + +import torch + +from invokeai.backend.ip_adapter.ip_adapter import ImageProjModel + + +class IPDoubleStreamBlock(torch.nn.Module): + def __init__(self, context_dim: int, hidden_dim: int): + super().__init__() + + self.context_dim = context_dim + self.hidden_dim = hidden_dim + + self.ip_adapter_double_stream_k_proj = torch.nn.Linear(context_dim, hidden_dim, bias=True) + self.ip_adapter_double_stream_v_proj = torch.nn.Linear(context_dim, hidden_dim, bias=True) + + +class XlabsIpAdapterFlux: + def __init__(self, image_proj: ImageProjModel, double_blocks: list[IPDoubleStreamBlock]): + self.image_proj = image_proj + self.double_blocks = double_blocks + + @classmethod + def from_state_dict(cls, state_dict: dict[str, torch.Tensor]) -> "XlabsIpAdapterFlux": + # TODO + + return cls() + + +@dataclass +class XlabsIpAdapterParams: + num_double_blocks: int + context_dim: int + hidden_dim: int + + clip_embeddings_dim: int + + +def infer_xlabs_ip_adapter_params_from_state_dict(state_dict: dict[str, torch.Tensor]) -> XlabsIpAdapterParams: + num_double_blocks = 0 + context_dim = 0 + hidden_dim = 0 + + # Count the number of double blocks. + double_block_index = 0 + while f"double_blocks.{double_block_index}.processor.ip_adapter_double_stream_k_proj.weight" in state_dict: + double_block_index += 1 + num_double_blocks = double_block_index + + hidden_dim = state_dict["double_blocks.0.processor.ip_adapter_double_stream_k_proj.weight"].shape[0] + context_dim = state_dict["double_blocks.0.processor.ip_adapter_double_stream_k_proj.weight"].shape[1] + clip_embeddings_dim = state_dict["ip_adapter_proj_model.proj.weight"].shape[1] + + return XlabsIpAdapterParams( + num_double_blocks=num_double_blocks, + context_dim=context_dim, + hidden_dim=hidden_dim, + clip_embeddings_dim=clip_embeddings_dim, + ) diff --git a/tests/backend/flux/ip_adapter/test_xlabs_ip_adapter_flux.py b/tests/backend/flux/ip_adapter/test_xlabs_ip_adapter_flux.py new file mode 100644 index 00000000000..a4ca8180d03 --- /dev/null +++ b/tests/backend/flux/ip_adapter/test_xlabs_ip_adapter_flux.py @@ -0,0 +1,17 @@ +import torch + +from invokeai.backend.flux.ip_adapter.xlabs_ip_adapter_flux import infer_xlabs_ip_adapter_params_from_state_dict +from tests.backend.flux.ip_adapter.xlabs_flux_ip_adapter_state_dict import xlabs_sd_shapes + + +def test_infer_xlabs_ip_adapter_params_from_state_dict(): + # Construct a dummy state_dict with tensors of the correct shape on the meta device. 
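+    # torch.device("meta") allocates tensors with shape/dtype metadata but no real storage, so the dummy weights are cheap to build.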
+ with torch.device("meta"): + sd = {k: torch.zeros(v) for k, v in xlabs_sd_shapes.items()} + + params = infer_xlabs_ip_adapter_params_from_state_dict(sd) + + assert params.num_double_blocks == 19 + assert params.context_dim == 4096 + assert params.hidden_dim == 3072 + assert params.clip_embeddings_dim == 768 From 24a0ca86f5878595886e5b13eabd32c8b738566a Mon Sep 17 00:00:00 2001 From: Ryan Dick Date: Tue, 15 Oct 2024 13:52:07 +0000 Subject: [PATCH 09/30] Add logic for loading an Xlabs IP-Adapter from a state dict. --- .../flux/ip_adapter/xlabs_ip_adapter_flux.py | 71 +++++++++++++++---- .../ip_adapter/test_xlabs_ip_adapter_flux.py | 21 +++++- 2 files changed, 79 insertions(+), 13 deletions(-) diff --git a/invokeai/backend/flux/ip_adapter/xlabs_ip_adapter_flux.py b/invokeai/backend/flux/ip_adapter/xlabs_ip_adapter_flux.py index 63fd1212215..182c8249c0f 100644 --- a/invokeai/backend/flux/ip_adapter/xlabs_ip_adapter_flux.py +++ b/invokeai/backend/flux/ip_adapter/xlabs_ip_adapter_flux.py @@ -1,5 +1,6 @@ from dataclasses import dataclass +import accelerate import torch from invokeai.backend.ip_adapter.ip_adapter import ImageProjModel @@ -16,18 +17,6 @@ def __init__(self, context_dim: int, hidden_dim: int): self.ip_adapter_double_stream_v_proj = torch.nn.Linear(context_dim, hidden_dim, bias=True) -class XlabsIpAdapterFlux: - def __init__(self, image_proj: ImageProjModel, double_blocks: list[IPDoubleStreamBlock]): - self.image_proj = image_proj - self.double_blocks = double_blocks - - @classmethod - def from_state_dict(cls, state_dict: dict[str, torch.Tensor]) -> "XlabsIpAdapterFlux": - # TODO - - return cls() - - @dataclass class XlabsIpAdapterParams: num_double_blocks: int @@ -37,6 +26,54 @@ class XlabsIpAdapterParams: clip_embeddings_dim: int +class XlabsIpAdapterFlux(torch.nn.Module): + def __init__(self, params: XlabsIpAdapterParams): + super().__init__() + self.image_proj = ImageProjModel( + cross_attention_dim=params.context_dim, clip_embeddings_dim=params.clip_embeddings_dim + ) + self.double_blocks = torch.nn.ModuleList( + [IPDoubleStreamBlock(params.context_dim, params.hidden_dim) for _ in range(params.num_double_blocks)] + ) + + def load_xlabs_state_dict(self, state_dict: dict[str, torch.Tensor], assign: bool = False): + """We need this custom function to load state dicts rather than using .load_state_dict(...) because the model + structure does not match the state_dict structure. + """ + # Split the state_dict into the image projection model and the double blocks. + image_proj_sd: dict[str, torch.Tensor] = {} + double_blocks_sd: dict[str, torch.Tensor] = {} + for k, v in state_dict.items(): + if k.startswith("ip_adapter_proj_model."): + image_proj_sd[k] = v + elif k.startswith("double_blocks."): + double_blocks_sd[k] = v + else: + raise ValueError(f"Unexpected key: {k}") + + # Initialize the image projection model. + image_proj_sd = {k.replace("ip_adapter_proj_model.", ""): v for k, v in image_proj_sd.items()} + self.image_proj.load_state_dict(image_proj_sd, assign=assign) + + # Initialize the double blocks. 
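+        # Each IPDoubleStreamBlock holds only the extra K/V projections that attend over the projected image embedding.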
+ for i, double_block in enumerate(self.double_blocks): + double_block_sd: dict[str, torch.Tensor] = { + "ip_adapter_double_stream_k_proj.bias": double_blocks_sd[ + f"double_blocks.{i}.processor.ip_adapter_double_stream_k_proj.bias" + ], + "ip_adapter_double_stream_k_proj.weight": double_blocks_sd[ + f"double_blocks.{i}.processor.ip_adapter_double_stream_k_proj.weight" + ], + "ip_adapter_double_stream_v_proj.bias": double_blocks_sd[ + f"double_blocks.{i}.processor.ip_adapter_double_stream_v_proj.bias" + ], + "ip_adapter_double_stream_v_proj.weight": double_blocks_sd[ + f"double_blocks.{i}.processor.ip_adapter_double_stream_v_proj.weight" + ], + } + double_block.load_state_dict(double_block_sd, assign=assign) + + def infer_xlabs_ip_adapter_params_from_state_dict(state_dict: dict[str, torch.Tensor]) -> XlabsIpAdapterParams: num_double_blocks = 0 context_dim = 0 @@ -58,3 +95,13 @@ def infer_xlabs_ip_adapter_params_from_state_dict(state_dict: dict[str, torch.Te hidden_dim=hidden_dim, clip_embeddings_dim=clip_embeddings_dim, ) + + +def load_xlabs_ip_adapter_flux(state_dict: dict[str, torch.Tensor]) -> XlabsIpAdapterFlux: + params = infer_xlabs_ip_adapter_params_from_state_dict(state_dict) + + with accelerate.init_empty_weights(): + model = XlabsIpAdapterFlux(params=params) + + model.load_xlabs_state_dict(state_dict) + return model diff --git a/tests/backend/flux/ip_adapter/test_xlabs_ip_adapter_flux.py b/tests/backend/flux/ip_adapter/test_xlabs_ip_adapter_flux.py index a4ca8180d03..c893fec2b81 100644 --- a/tests/backend/flux/ip_adapter/test_xlabs_ip_adapter_flux.py +++ b/tests/backend/flux/ip_adapter/test_xlabs_ip_adapter_flux.py @@ -1,6 +1,10 @@ +import accelerate import torch -from invokeai.backend.flux.ip_adapter.xlabs_ip_adapter_flux import infer_xlabs_ip_adapter_params_from_state_dict +from invokeai.backend.flux.ip_adapter.xlabs_ip_adapter_flux import ( + XlabsIpAdapterFlux, + infer_xlabs_ip_adapter_params_from_state_dict, +) from tests.backend.flux.ip_adapter.xlabs_flux_ip_adapter_state_dict import xlabs_sd_shapes @@ -15,3 +19,18 @@ def test_infer_xlabs_ip_adapter_params_from_state_dict(): assert params.context_dim == 4096 assert params.hidden_dim == 3072 assert params.clip_embeddings_dim == 768 + + +def test_initialize_xlabs_ip_adapter_flux_from_state_dict(): + # Construct a dummy state_dict with tensors of the correct shape on the meta device. + with torch.device("meta"): + sd = {k: torch.zeros(v) for k, v in xlabs_sd_shapes.items()} + + # Initialize the XLabs IP-Adapter from the state_dict. + params = infer_xlabs_ip_adapter_params_from_state_dict(sd) + + with accelerate.init_empty_weights(): + model = XlabsIpAdapterFlux(params=params) + + # Smoke test state_dict loading. + model.load_xlabs_state_dict(sd) From f939dbdc339b3ecab5a1e69282f58c378b976a66 Mon Sep 17 00:00:00 2001 From: Ryan Dick Date: Tue, 15 Oct 2024 14:49:00 +0000 Subject: [PATCH 10/30] Add is_state_dict_xlabs_ip_adapter() utility function. 
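The detector is purely key-based: it checks for a small set of keys that are characteristic of the XLabs checkpoint layout rather than inspecting tensor shapes or values. A hypothetical usage sketch (the checkpoint path is illustrative):

    from safetensors.torch import load_file

    from invokeai.backend.flux.ip_adapter.state_dict_utils import is_state_dict_xlabs_ip_adapter

    sd = load_file("flux-ip-adapter.safetensors")
    if is_state_dict_xlabs_ip_adapter(sd):
        ...  # route to the FLUX IP-Adapter loading path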
--- .../flux/ip_adapter/state_dict_utils.py | 23 +++++++++++++++++++ .../ip_adapter/test_xlabs_ip_adapter_flux.py | 8 +++++++ 2 files changed, 31 insertions(+) create mode 100644 invokeai/backend/flux/ip_adapter/state_dict_utils.py diff --git a/invokeai/backend/flux/ip_adapter/state_dict_utils.py b/invokeai/backend/flux/ip_adapter/state_dict_utils.py new file mode 100644 index 00000000000..96d724f242f --- /dev/null +++ b/invokeai/backend/flux/ip_adapter/state_dict_utils.py @@ -0,0 +1,23 @@ +from typing import Any, Dict + + +def is_state_dict_xlabs_ip_adapter(sd: Dict[str, Any]) -> bool: + """Is the state dict for an XLabs FLUX IP-Adapter model? + + This is intended to be a reasonably high-precision detector, but it is not guaranteed to have perfect precision. + """ + # If all of the expected keys are present, then this is very likely an XLabs IP-Adapter model. + expected_keys = { + "double_blocks.0.processor.ip_adapter_double_stream_k_proj.bias", + "double_blocks.0.processor.ip_adapter_double_stream_k_proj.weight", + "double_blocks.0.processor.ip_adapter_double_stream_v_proj.bias", + "double_blocks.0.processor.ip_adapter_double_stream_v_proj.weight", + "ip_adapter_proj_model.norm.bias", + "ip_adapter_proj_model.norm.weight", + "ip_adapter_proj_model.proj.bias", + "ip_adapter_proj_model.proj.weight", + } + + if expected_keys.issubset(sd.keys()): + return True + return False diff --git a/tests/backend/flux/ip_adapter/test_xlabs_ip_adapter_flux.py b/tests/backend/flux/ip_adapter/test_xlabs_ip_adapter_flux.py index c893fec2b81..6ffb36aeeb8 100644 --- a/tests/backend/flux/ip_adapter/test_xlabs_ip_adapter_flux.py +++ b/tests/backend/flux/ip_adapter/test_xlabs_ip_adapter_flux.py @@ -1,6 +1,7 @@ import accelerate import torch +from invokeai.backend.flux.ip_adapter.state_dict_utils import is_state_dict_xlabs_ip_adapter from invokeai.backend.flux.ip_adapter.xlabs_ip_adapter_flux import ( XlabsIpAdapterFlux, infer_xlabs_ip_adapter_params_from_state_dict, @@ -8,6 +9,13 @@ from tests.backend.flux.ip_adapter.xlabs_flux_ip_adapter_state_dict import xlabs_sd_shapes +def test_is_state_dict_xlabs_ip_adapter(): + # Construct a dummy state_dict. + sd = {k: None for k in xlabs_sd_shapes} + + assert is_state_dict_xlabs_ip_adapter(sd) + + def test_infer_xlabs_ip_adapter_params_from_state_dict(): # Construct a dummy state_dict with tensors of the correct shape on the meta device. with torch.device("meta"): From 412e79d8e6b16340363aac4567897f505a2dd387 Mon Sep 17 00:00:00 2001 From: Ryan Dick Date: Tue, 15 Oct 2024 14:58:04 +0000 Subject: [PATCH 11/30] Add model probing for XLabs FLUX IP-Adapter. --- invokeai/backend/model_manager/probe.py | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/invokeai/backend/model_manager/probe.py b/invokeai/backend/model_manager/probe.py index fe8b5669991..c7f3062aa32 100644 --- a/invokeai/backend/model_manager/probe.py +++ b/invokeai/backend/model_manager/probe.py @@ -14,6 +14,7 @@ is_state_dict_instantx_controlnet, is_state_dict_xlabs_controlnet, ) +from invokeai.backend.flux.ip_adapter.state_dict_utils import is_state_dict_xlabs_ip_adapter from invokeai.backend.lora.conversions.flux_diffusers_lora_conversion_utils import ( is_state_dict_likely_in_flux_diffusers_format, ) @@ -243,8 +244,6 @@ def get_model_type_from_checkpoint(cls, model_path: Path, checkpoint: Optional[C "cond_stage_model.", "first_stage_model.", "model.diffusion_model.", - # FLUX models in the official BFL format contain keys with the "double_blocks." prefix. 
- "double_blocks.", # Some FLUX checkpoint files contain transformer keys prefixed with "model.diffusion_model". # This prefix is typically used to distinguish between multiple models bundled in a single file. "model.diffusion_model.double_blocks.", @@ -252,6 +251,10 @@ def get_model_type_from_checkpoint(cls, model_path: Path, checkpoint: Optional[C ): # Keys starting with double_blocks are associated with Flux models return ModelType.Main + # FLUX models in the official BFL format contain keys with the "double_blocks." prefix, but we must be + # careful to avoid false positives on XLabs FLUX IP-Adapter models. + elif key.startswith("double_blocks.") and "ip_adapter" not in key: + return ModelType.Main elif key.startswith(("encoder.conv_in", "decoder.conv_in")): return ModelType.VAE elif key.startswith(("lora_te_", "lora_unet_")): @@ -274,7 +277,14 @@ def get_model_type_from_checkpoint(cls, model_path: Path, checkpoint: Optional[C ) ): return ModelType.ControlNet - elif key.startswith(("image_proj.", "ip_adapter.")): + elif key.startswith( + ( + "image_proj.", + "ip_adapter.", + # XLabs FLUX IP-Adapter models have keys startinh with "ip_adapter_proj_model.". + "ip_adapter_proj_model.", + ) + ): return ModelType.IPAdapter elif key in {"emb_params", "string_to_param"}: return ModelType.TextualInversion @@ -672,6 +682,10 @@ class IPAdapterCheckpointProbe(CheckpointProbeBase): def get_base_type(self) -> BaseModelType: checkpoint = self.checkpoint + + if is_state_dict_xlabs_ip_adapter(checkpoint): + return BaseModelType.Flux + for key in checkpoint.keys(): if not key.startswith(("image_proj.", "ip_adapter.")): continue From d6643d726376f4ff1ef0cd491e7e864c272bbe3d Mon Sep 17 00:00:00 2001 From: Ryan Dick Date: Tue, 15 Oct 2024 14:58:37 +0000 Subject: [PATCH 12/30] Add model loading code for xlabs FLUX IP-Adapter (not tested). 
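The loader follows the same pattern as the existing FLUX checkpoint loaders in this file: infer the architecture params from the state dict, build the module with empty (meta) weights via accelerate, then materialize the real weights with assign=True. Roughly (a simplified sketch of the loader added below; the checkpoint path is illustrative):

    import accelerate
    from safetensors.torch import load_file

    from invokeai.backend.flux.ip_adapter.xlabs_ip_adapter_flux import (
        XlabsIpAdapterFlux,
        infer_xlabs_ip_adapter_params_from_state_dict,
    )

    sd = load_file("flux-ip-adapter.safetensors")
    params = infer_xlabs_ip_adapter_params_from_state_dict(sd)
    with accelerate.init_empty_weights():
        model = XlabsIpAdapterFlux(params=params)
    model.load_xlabs_state_dict(sd, assign=True)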
--- .../flux/ip_adapter/xlabs_ip_adapter_flux.py | 11 -------- .../model_manager/load/model_loaders/flux.py | 28 +++++++++++++++++++ 2 files changed, 28 insertions(+), 11 deletions(-) diff --git a/invokeai/backend/flux/ip_adapter/xlabs_ip_adapter_flux.py b/invokeai/backend/flux/ip_adapter/xlabs_ip_adapter_flux.py index 182c8249c0f..3de2ed2a157 100644 --- a/invokeai/backend/flux/ip_adapter/xlabs_ip_adapter_flux.py +++ b/invokeai/backend/flux/ip_adapter/xlabs_ip_adapter_flux.py @@ -1,6 +1,5 @@ from dataclasses import dataclass -import accelerate import torch from invokeai.backend.ip_adapter.ip_adapter import ImageProjModel @@ -95,13 +94,3 @@ def infer_xlabs_ip_adapter_params_from_state_dict(state_dict: dict[str, torch.Te hidden_dim=hidden_dim, clip_embeddings_dim=clip_embeddings_dim, ) - - -def load_xlabs_ip_adapter_flux(state_dict: dict[str, torch.Tensor]) -> XlabsIpAdapterFlux: - params = infer_xlabs_ip_adapter_params_from_state_dict(state_dict) - - with accelerate.init_empty_weights(): - model = XlabsIpAdapterFlux(params=params) - - model.load_xlabs_state_dict(state_dict) - return model diff --git a/invokeai/backend/model_manager/load/model_loaders/flux.py b/invokeai/backend/model_manager/load/model_loaders/flux.py index b82a17c69a1..8d9c3f6f432 100644 --- a/invokeai/backend/model_manager/load/model_loaders/flux.py +++ b/invokeai/backend/model_manager/load/model_loaders/flux.py @@ -19,6 +19,10 @@ is_state_dict_xlabs_controlnet, ) from invokeai.backend.flux.controlnet.xlabs_controlnet_flux import XLabsControlNetFlux +from invokeai.backend.flux.ip_adapter.xlabs_ip_adapter_flux import ( + XlabsIpAdapterFlux, + infer_xlabs_ip_adapter_params_from_state_dict, +) from invokeai.backend.flux.model import Flux from invokeai.backend.flux.modules.autoencoder import AutoEncoder from invokeai.backend.flux.util import ae_params, params @@ -35,6 +39,7 @@ CLIPEmbedDiffusersConfig, ControlNetCheckpointConfig, ControlNetDiffusersConfig, + IPAdapterCheckpointConfig, MainBnbQuantized4bCheckpointConfig, MainCheckpointConfig, MainGGUFCheckpointConfig, @@ -352,3 +357,26 @@ def _load_instantx_controlnet(self, sd: dict[str, torch.Tensor]) -> AnyModel: model.load_state_dict(sd, assign=True) return model + + +@ModelLoaderRegistry.register(base=BaseModelType.Flux, type=ModelType.IPAdapter, format=ModelFormat.Checkpoint) +class FluxIpAdapterModel(ModelLoader): + """Class to load FLUX IP-Adapter models.""" + + def _load_model( + self, + config: AnyModelConfig, + submodel_type: Optional[SubModelType] = None, + ) -> AnyModel: + if not isinstance(config, IPAdapterCheckpointConfig): + raise ValueError(f"Unexpected model config type: {type(config)}.") + + sd = load_file(Path(config.path)) + + params = infer_xlabs_ip_adapter_params_from_state_dict(sd) + + with accelerate.init_empty_weights(): + model = XlabsIpAdapterFlux(params=params) + + model.load_xlabs_state_dict(sd, assign=True) + return model From c2a8fbd8d65ccdcadf305b25804591739c400f1a Mon Sep 17 00:00:00 2001 From: Ryan Dick Date: Tue, 15 Oct 2024 15:02:03 +0000 Subject: [PATCH 13/30] (minor) Move infer_xlabs_ip_adapter_params_from_state_dict(...) to state_dict_utils.py. 
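After this move, both the format detector and the param-inference helper live in state_dict_utils.py, so callers import them from one place:

    from invokeai.backend.flux.ip_adapter.state_dict_utils import (
        infer_xlabs_ip_adapter_params_from_state_dict,
        is_state_dict_xlabs_ip_adapter,
    )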
--- .../flux/ip_adapter/state_dict_utils.py | 27 +++++++++++++++++++ .../flux/ip_adapter/xlabs_ip_adapter_flux.py | 23 ---------------- .../model_manager/load/model_loaders/flux.py | 2 +- .../ip_adapter/test_xlabs_ip_adapter_flux.py | 6 +++-- 4 files changed, 32 insertions(+), 26 deletions(-) diff --git a/invokeai/backend/flux/ip_adapter/state_dict_utils.py b/invokeai/backend/flux/ip_adapter/state_dict_utils.py index 96d724f242f..dff4978480f 100644 --- a/invokeai/backend/flux/ip_adapter/state_dict_utils.py +++ b/invokeai/backend/flux/ip_adapter/state_dict_utils.py @@ -1,5 +1,9 @@ from typing import Any, Dict +import torch + +from invokeai.backend.flux.ip_adapter.xlabs_ip_adapter_flux import XlabsIpAdapterParams + def is_state_dict_xlabs_ip_adapter(sd: Dict[str, Any]) -> bool: """Is the state dict for an XLabs FLUX IP-Adapter model? @@ -21,3 +25,26 @@ def is_state_dict_xlabs_ip_adapter(sd: Dict[str, Any]) -> bool: if expected_keys.issubset(sd.keys()): return True return False + + +def infer_xlabs_ip_adapter_params_from_state_dict(state_dict: dict[str, torch.Tensor]) -> XlabsIpAdapterParams: + num_double_blocks = 0 + context_dim = 0 + hidden_dim = 0 + + # Count the number of double blocks. + double_block_index = 0 + while f"double_blocks.{double_block_index}.processor.ip_adapter_double_stream_k_proj.weight" in state_dict: + double_block_index += 1 + num_double_blocks = double_block_index + + hidden_dim = state_dict["double_blocks.0.processor.ip_adapter_double_stream_k_proj.weight"].shape[0] + context_dim = state_dict["double_blocks.0.processor.ip_adapter_double_stream_k_proj.weight"].shape[1] + clip_embeddings_dim = state_dict["ip_adapter_proj_model.proj.weight"].shape[1] + + return XlabsIpAdapterParams( + num_double_blocks=num_double_blocks, + context_dim=context_dim, + hidden_dim=hidden_dim, + clip_embeddings_dim=clip_embeddings_dim, + ) diff --git a/invokeai/backend/flux/ip_adapter/xlabs_ip_adapter_flux.py b/invokeai/backend/flux/ip_adapter/xlabs_ip_adapter_flux.py index 3de2ed2a157..152c391059e 100644 --- a/invokeai/backend/flux/ip_adapter/xlabs_ip_adapter_flux.py +++ b/invokeai/backend/flux/ip_adapter/xlabs_ip_adapter_flux.py @@ -71,26 +71,3 @@ def load_xlabs_state_dict(self, state_dict: dict[str, torch.Tensor], assign: boo ], } double_block.load_state_dict(double_block_sd, assign=assign) - - -def infer_xlabs_ip_adapter_params_from_state_dict(state_dict: dict[str, torch.Tensor]) -> XlabsIpAdapterParams: - num_double_blocks = 0 - context_dim = 0 - hidden_dim = 0 - - # Count the number of double blocks. 
- double_block_index = 0 - while f"double_blocks.{double_block_index}.processor.ip_adapter_double_stream_k_proj.weight" in state_dict: - double_block_index += 1 - num_double_blocks = double_block_index - - hidden_dim = state_dict["double_blocks.0.processor.ip_adapter_double_stream_k_proj.weight"].shape[0] - context_dim = state_dict["double_blocks.0.processor.ip_adapter_double_stream_k_proj.weight"].shape[1] - clip_embeddings_dim = state_dict["ip_adapter_proj_model.proj.weight"].shape[1] - - return XlabsIpAdapterParams( - num_double_blocks=num_double_blocks, - context_dim=context_dim, - hidden_dim=hidden_dim, - clip_embeddings_dim=clip_embeddings_dim, - ) diff --git a/invokeai/backend/model_manager/load/model_loaders/flux.py b/invokeai/backend/model_manager/load/model_loaders/flux.py index 8d9c3f6f432..af1101f62da 100644 --- a/invokeai/backend/model_manager/load/model_loaders/flux.py +++ b/invokeai/backend/model_manager/load/model_loaders/flux.py @@ -19,9 +19,9 @@ is_state_dict_xlabs_controlnet, ) from invokeai.backend.flux.controlnet.xlabs_controlnet_flux import XLabsControlNetFlux +from invokeai.backend.flux.ip_adapter.state_dict_utils import infer_xlabs_ip_adapter_params_from_state_dict from invokeai.backend.flux.ip_adapter.xlabs_ip_adapter_flux import ( XlabsIpAdapterFlux, - infer_xlabs_ip_adapter_params_from_state_dict, ) from invokeai.backend.flux.model import Flux from invokeai.backend.flux.modules.autoencoder import AutoEncoder diff --git a/tests/backend/flux/ip_adapter/test_xlabs_ip_adapter_flux.py b/tests/backend/flux/ip_adapter/test_xlabs_ip_adapter_flux.py index 6ffb36aeeb8..1c88304ea13 100644 --- a/tests/backend/flux/ip_adapter/test_xlabs_ip_adapter_flux.py +++ b/tests/backend/flux/ip_adapter/test_xlabs_ip_adapter_flux.py @@ -1,10 +1,12 @@ import accelerate import torch -from invokeai.backend.flux.ip_adapter.state_dict_utils import is_state_dict_xlabs_ip_adapter +from invokeai.backend.flux.ip_adapter.state_dict_utils import ( + infer_xlabs_ip_adapter_params_from_state_dict, + is_state_dict_xlabs_ip_adapter, +) from invokeai.backend.flux.ip_adapter.xlabs_ip_adapter_flux import ( XlabsIpAdapterFlux, - infer_xlabs_ip_adapter_params_from_state_dict, ) from tests.backend.flux.ip_adapter.xlabs_flux_ip_adapter_state_dict import xlabs_sd_shapes From 3fa10128794d23bbb7ecd9b7e74c610fbad548a6 Mon Sep 17 00:00:00 2001 From: Ryan Dick Date: Tue, 15 Oct 2024 15:31:26 +0000 Subject: [PATCH 14/30] Add IPAdapterDoubleBlocks wrapper to tidy FLUX ip-adapter handling. 
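With the wrapper in place, loading the double-block weights reduces to one key rewrite plus a single load_state_dict() call instead of a per-block loop. The XLabs checkpoint keys look like double_blocks.{i}.processor.ip_adapter_double_stream_k_proj.weight, and after dropping the "processor." segment they line up one-to-one with the keys of the IPAdapterDoubleBlocks ModuleList, for example:

    double_blocks.0.processor.ip_adapter_double_stream_k_proj.weight
        -> double_blocks.0.ip_adapter_double_stream_k_proj.weight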
--- .../flux/ip_adapter/xlabs_ip_adapter_flux.py | 30 ++++++++----------- 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/invokeai/backend/flux/ip_adapter/xlabs_ip_adapter_flux.py b/invokeai/backend/flux/ip_adapter/xlabs_ip_adapter_flux.py index 152c391059e..cfe72eb54b9 100644 --- a/invokeai/backend/flux/ip_adapter/xlabs_ip_adapter_flux.py +++ b/invokeai/backend/flux/ip_adapter/xlabs_ip_adapter_flux.py @@ -16,6 +16,14 @@ def __init__(self, context_dim: int, hidden_dim: int): self.ip_adapter_double_stream_v_proj = torch.nn.Linear(context_dim, hidden_dim, bias=True) +class IPAdapterDoubleBlocks(torch.nn.Module): + def __init__(self, num_double_blocks: int, context_dim: int, hidden_dim: int): + super().__init__() + self.double_blocks = torch.nn.ModuleList( + [IPDoubleStreamBlock(context_dim, hidden_dim) for _ in range(num_double_blocks)] + ) + + @dataclass class XlabsIpAdapterParams: num_double_blocks: int @@ -31,8 +39,8 @@ def __init__(self, params: XlabsIpAdapterParams): self.image_proj = ImageProjModel( cross_attention_dim=params.context_dim, clip_embeddings_dim=params.clip_embeddings_dim ) - self.double_blocks = torch.nn.ModuleList( - [IPDoubleStreamBlock(params.context_dim, params.hidden_dim) for _ in range(params.num_double_blocks)] + self.ip_adapter_double_blocks = IPAdapterDoubleBlocks( + num_double_blocks=params.num_double_blocks, context_dim=params.context_dim, hidden_dim=params.hidden_dim ) def load_xlabs_state_dict(self, state_dict: dict[str, torch.Tensor], assign: bool = False): @@ -55,19 +63,5 @@ def load_xlabs_state_dict(self, state_dict: dict[str, torch.Tensor], assign: boo self.image_proj.load_state_dict(image_proj_sd, assign=assign) # Initialize the double blocks. - for i, double_block in enumerate(self.double_blocks): - double_block_sd: dict[str, torch.Tensor] = { - "ip_adapter_double_stream_k_proj.bias": double_blocks_sd[ - f"double_blocks.{i}.processor.ip_adapter_double_stream_k_proj.bias" - ], - "ip_adapter_double_stream_k_proj.weight": double_blocks_sd[ - f"double_blocks.{i}.processor.ip_adapter_double_stream_k_proj.weight" - ], - "ip_adapter_double_stream_v_proj.bias": double_blocks_sd[ - f"double_blocks.{i}.processor.ip_adapter_double_stream_v_proj.bias" - ], - "ip_adapter_double_stream_v_proj.weight": double_blocks_sd[ - f"double_blocks.{i}.processor.ip_adapter_double_stream_v_proj.weight" - ], - } - double_block.load_state_dict(double_block_sd, assign=assign) + double_blocks_sd = {k.replace("processor.", ""): v for k, v in double_blocks_sd.items()} + self.ip_adapter_double_blocks.load_state_dict(double_blocks_sd, assign=assign) From 31ffd734233c6a917b33dbdc95adb736a854bbcb Mon Sep 17 00:00:00 2001 From: Ryan Dick Date: Tue, 15 Oct 2024 22:28:59 +0000 Subject: [PATCH 15/30] Initial draft of integrating FLUX IP-Adapter inference support. 
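High-level flow: the denoise node encodes the IP-Adapter image prompt(s) with the CLIP vision model, wraps the adapter weights in an XLabsIPAdapterExtension, and threads the extensions through denoise() into a custom double-stream block forward. Inside each double block, the extension projects the image embedding to per-block keys and values and attends against that block's image-stream query. A sketch of the math (the names image_proj_model, k_proj and v_proj are shorthand for the projection layers in the diff, not the actual attribute names):

    # once per denoise() call:
    image_proj = image_proj_model(clip_image_embeds)
    # inside each double-stream block:
    ip_k = k_proj(image_proj)
    ip_v = v_proj(image_proj)
    img = img + weight * scaled_dot_product_attention(img_q, ip_k, ip_v)

where weight comes from the begin/end step schedule of the extension.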
--- invokeai/app/invocations/flux_denoise.py | 92 +++++++++++++++++++ .../backend/flux/custom_block_processor.py | 83 +++++++++++++++++ invokeai/backend/flux/denoise.py | 5 + .../extensions/xlabs_ip_adapter_extension.py | 89 ++++++++++++++++++ invokeai/backend/flux/model.py | 19 +++- 5 files changed, 287 insertions(+), 1 deletion(-) create mode 100644 invokeai/backend/flux/custom_block_processor.py create mode 100644 invokeai/backend/flux/extensions/xlabs_ip_adapter_extension.py diff --git a/invokeai/app/invocations/flux_denoise.py b/invokeai/app/invocations/flux_denoise.py index 8120ac400f5..1b7dea7b607 100644 --- a/invokeai/app/invocations/flux_denoise.py +++ b/invokeai/app/invocations/flux_denoise.py @@ -4,12 +4,14 @@ import torch import torchvision.transforms as tv_transforms from torchvision.transforms.functional import resize as tv_resize +from transformers import CLIPVisionModelWithProjection from invokeai.app.invocations.baseinvocation import BaseInvocation, Classification, invocation from invokeai.app.invocations.fields import ( DenoiseMaskField, FieldDescriptions, FluxConditioningField, + ImageField, Input, InputField, LatentsField, @@ -17,6 +19,7 @@ WithMetadata, ) from invokeai.app.invocations.flux_controlnet import FluxControlNetField +from invokeai.app.invocations.ip_adapter import IPAdapterField from invokeai.app.invocations.model import TransformerField, VAEField from invokeai.app.invocations.primitives import LatentsOutput from invokeai.app.services.shared.invocation_context import InvocationContext @@ -26,6 +29,8 @@ from invokeai.backend.flux.extensions.inpaint_extension import InpaintExtension from invokeai.backend.flux.extensions.instantx_controlnet_extension import InstantXControlNetExtension from invokeai.backend.flux.extensions.xlabs_controlnet_extension import XLabsControlNetExtension +from invokeai.backend.flux.extensions.xlabs_ip_adapter_extension import XLabsIPAdapterExtension +from invokeai.backend.flux.ip_adapter.xlabs_ip_adapter_flux import XlabsIpAdapterFlux from invokeai.backend.flux.model import Flux from invokeai.backend.flux.sampling_utils import ( clip_timestep_schedule_fractional, @@ -118,6 +123,10 @@ class FluxDenoiseInvocation(BaseInvocation, WithMetadata, WithBoard): input=Input.Connection, ) + ip_adapter: IPAdapterField | list[IPAdapterField] | None = InputField( + description=FieldDescriptions.ip_adapter, title="IP-Adapter", default=None, input=Input.Connection + ) + @torch.no_grad() def invoke(self, context: InvocationContext) -> LatentsOutput: latents = self._run_diffusion(context) @@ -245,6 +254,12 @@ def _run_diffusion( noise=noise, ) + # Compute the IP-Adapter image prompt clip embeddings. + # We do this before loading other models to minimize peak memory. + # TODO(ryand): We should really do this in a separate invocation to benefit from caching. + ip_adapter_fields = self._normalize_ip_adapter_fields() + image_prompt_clip_embeds = self._prep_ip_adapter_image_prompt_clip_embeds(ip_adapter_fields, context) + cfg_scale = self.prep_cfg_scale( cfg_scale=self.cfg_scale, timesteps=timesteps, @@ -300,6 +315,15 @@ def _run_diffusion( else: raise ValueError(f"Unsupported model format: {config.format}") + # Prepare IP-Adapter extensions. 
+ ip_adapter_extensions = self._prep_ip_adapter_extensions( + image_prompt_clip_embeds=image_prompt_clip_embeds, + ip_adapter_fields=ip_adapter_fields, + context=context, + exit_stack=exit_stack, + dtype=inference_dtype, + ) + x = denoise( model=transformer, img=x, @@ -316,6 +340,7 @@ def _run_diffusion( cfg_scale=cfg_scale, inpaint_extension=inpaint_extension, controlnet_extensions=controlnet_extensions, + ip_adapter_extensions=ip_adapter_extensions, ) x = unpack(x.float(), self.height, self.width) @@ -509,6 +534,73 @@ def _prep_controlnet_extensions( return controlnet_extensions + def _normalize_ip_adapter_fields(self) -> list[IPAdapterField]: + if self.ip_adapter is None: + return [] + elif isinstance(self.ip_adapter, IPAdapterField): + return [self.ip_adapter] + elif isinstance(self.ip_adapter, list): + return self.ip_adapter + else: + raise ValueError(f"Unsupported IP-Adapter type: {type(self.ip_adapter)}") + + def _prep_ip_adapter_image_prompt_clip_embeds( + self, + ip_adapter_fields: list[IPAdapterField], + context: InvocationContext, + ) -> list[torch.Tensor]: + """Run the IPAdapter CLIPVisionModel, returning image prompt embeddings.""" + image_prompt_clip_embeds: list[torch.Tensor] = [] + for ip_adapter_field in ip_adapter_fields: + # `ip_adapter_field.image` could be a list or a single ImageField. Normalize to a list here. + ipa_image_fields: list[ImageField] + if isinstance(ip_adapter_field.image, ImageField): + ipa_image_fields = [ip_adapter_field.image] + elif isinstance(ip_adapter_field.image, list): + ipa_image_fields = ip_adapter_field.image + else: + raise ValueError(f"Unsupported IP-Adapter image type: {type(ip_adapter_field.image)}") + + ipa_images = [context.images.get_pil(image.image_name) for image in ipa_image_fields] + + with context.models.load(ip_adapter_field.image_encoder_model) as image_encoder_model: + assert isinstance(image_encoder_model, CLIPVisionModelWithProjection) + image_prompt_clip_embeds.append( + XLabsIPAdapterExtension.run_clip_image_encoder( + pil_image=ipa_images, + image_encoder=image_encoder_model, + ) + ) + return image_prompt_clip_embeds + + def _prep_ip_adapter_extensions( + self, + ip_adapter_fields: list[IPAdapterField], + image_prompt_clip_embeds: list[torch.Tensor], + context: InvocationContext, + exit_stack: ExitStack, + dtype: torch.dtype, + ) -> list[XLabsIPAdapterExtension]: + ip_adapter_extensions: list[XLabsIPAdapterExtension] = [] + for ip_adapter_field, image_prompt_clip_embed in zip(ip_adapter_fields, image_prompt_clip_embeds, strict=True): + ip_adapter_model = exit_stack.enter_context(context.models.load(ip_adapter_field.ip_adapter_model)) + assert isinstance(ip_adapter_model, XlabsIpAdapterFlux) + ip_adapter_model = ip_adapter_model.to(dtype=dtype) + if ip_adapter_field.mask is not None: + raise ValueError("IP-Adapter masks are not yet supported in Flux.") + ip_adapter_extension = XLabsIPAdapterExtension( + model=ip_adapter_model, + image_prompt_clip_embed=image_prompt_clip_embed, + weight=ip_adapter_field.weight, + begin_step_percent=ip_adapter_field.begin_step_percent, + end_step_percent=ip_adapter_field.end_step_percent, + ) + + ip_adapter_extension.run_image_proj(dtype=dtype) + ip_adapter_extensions.append(ip_adapter_extension) + + return ip_adapter_extensions + def _lora_iterator(self, context: InvocationContext) -> Iterator[Tuple[LoRAModelRaw, float]]: for lora in self.transformer.loras: lora_info = context.models.load(lora.lora) diff --git a/invokeai/backend/flux/custom_block_processor.py 
b/invokeai/backend/flux/custom_block_processor.py new file mode 100644 index 00000000000..e0c7779e935 --- /dev/null +++ b/invokeai/backend/flux/custom_block_processor.py @@ -0,0 +1,83 @@ +import einops +import torch + +from invokeai.backend.flux.extensions.xlabs_ip_adapter_extension import XLabsIPAdapterExtension +from invokeai.backend.flux.math import attention +from invokeai.backend.flux.modules.layers import DoubleStreamBlock + + +class CustomDoubleStreamBlockProcessor: + """A class containing a custom implementation of DoubleStreamBlock.forward() with additional features + (IP-Adapter, etc.). + """ + + @staticmethod + def _double_stream_block_forward( + block: DoubleStreamBlock, img: torch.Tensor, txt: torch.Tensor, vec: torch.Tensor, pe: torch.Tensor + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """This function is a direct copy of DoubleStreamBlock.forward(), but it returns some of the intermediate + values. + """ + img_mod1, img_mod2 = block.img_mod(vec) + txt_mod1, txt_mod2 = block.txt_mod(vec) + + # prepare image for attention + img_modulated = block.img_norm1(img) + img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift + img_qkv = block.img_attn.qkv(img_modulated) + img_q, img_k, img_v = einops.rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=block.num_heads) + img_q, img_k = block.img_attn.norm(img_q, img_k, img_v) + + # prepare txt for attention + txt_modulated = block.txt_norm1(txt) + txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift + txt_qkv = block.txt_attn.qkv(txt_modulated) + txt_q, txt_k, txt_v = einops.rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=block.num_heads) + txt_q, txt_k = block.txt_attn.norm(txt_q, txt_k, txt_v) + + # run actual attention + q = torch.cat((txt_q, img_q), dim=2) + k = torch.cat((txt_k, img_k), dim=2) + v = torch.cat((txt_v, img_v), dim=2) + + attn = attention(q, k, v, pe=pe) + txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :] + + # calculate the img bloks + img = img + img_mod1.gate * block.img_attn.proj(img_attn) + img = img + img_mod2.gate * block.img_mlp((1 + img_mod2.scale) * block.img_norm2(img) + img_mod2.shift) + + # calculate the txt bloks + txt = txt + txt_mod1.gate * block.txt_attn.proj(txt_attn) + txt = txt + txt_mod2.gate * block.txt_mlp((1 + txt_mod2.scale) * block.txt_norm2(txt) + txt_mod2.shift) + return img, txt, img_q + + @staticmethod + def custom_double_block_forward( + timestep_index: int, + total_num_timesteps: int, + block_index: int, + block: DoubleStreamBlock, + img: torch.Tensor, + txt: torch.Tensor, + vec: torch.Tensor, + pe: torch.Tensor, + ip_adapter_extensions: list[XLabsIPAdapterExtension], + ) -> tuple[torch.Tensor, torch.Tensor]: + """A custom implementation of DoubleStreamBlock.forward() with additional features: + - IP-Adapter support + """ + img, txt, img_q = CustomDoubleStreamBlockProcessor._double_stream_block_forward(block, img, txt, vec, pe) + + # Apply IP-Adapter conditioning. 
b/invokeai/backend/flux/custom_block_processor.py new file mode 100644 index 00000000000..e0c7779e935 --- /dev/null +++ b/invokeai/backend/flux/custom_block_processor.py @@ -0,0 +1,83 @@ +import einops +import torch + +from invokeai.backend.flux.extensions.xlabs_ip_adapter_extension import XLabsIPAdapterExtension +from invokeai.backend.flux.math import attention +from invokeai.backend.flux.modules.layers import DoubleStreamBlock + + +class CustomDoubleStreamBlockProcessor: + """A class containing a custom implementation of DoubleStreamBlock.forward() with additional features + (IP-Adapter, etc.). + """ + + @staticmethod + def _double_stream_block_forward( + block: DoubleStreamBlock, img: torch.Tensor, txt: torch.Tensor, vec: torch.Tensor, pe: torch.Tensor + ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """This function is a direct copy of DoubleStreamBlock.forward(), but it returns some of the intermediate + values. + """ + img_mod1, img_mod2 = block.img_mod(vec) + txt_mod1, txt_mod2 = block.txt_mod(vec) + + # prepare image for attention + img_modulated = block.img_norm1(img) + img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift + img_qkv = block.img_attn.qkv(img_modulated) + img_q, img_k, img_v = einops.rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=block.num_heads) + img_q, img_k = block.img_attn.norm(img_q, img_k, img_v) + + # prepare txt for attention + txt_modulated = block.txt_norm1(txt) + txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift + txt_qkv = block.txt_attn.qkv(txt_modulated) + txt_q, txt_k, txt_v = einops.rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=block.num_heads) + txt_q, txt_k = block.txt_attn.norm(txt_q, txt_k, txt_v) + + # run actual attention + q = torch.cat((txt_q, img_q), dim=2) + k = torch.cat((txt_k, img_k), dim=2) + v = torch.cat((txt_v, img_v), dim=2) + + attn = attention(q, k, v, pe=pe) + txt_attn, img_attn = attn[:, : txt.shape[1]], attn[:, txt.shape[1] :] + + # calculate the img blocks + img = img + img_mod1.gate * block.img_attn.proj(img_attn) + img = img + img_mod2.gate * block.img_mlp((1 + img_mod2.scale) * block.img_norm2(img) + img_mod2.shift) + + # calculate the txt blocks + txt = txt + txt_mod1.gate * block.txt_attn.proj(txt_attn) + txt = txt + txt_mod2.gate * block.txt_mlp((1 + txt_mod2.scale) * block.txt_norm2(txt) + txt_mod2.shift) + return img, txt, img_q + + @staticmethod + def custom_double_block_forward( + timestep_index: int, + total_num_timesteps: int, + block_index: int, + block: DoubleStreamBlock, + img: torch.Tensor, + txt: torch.Tensor, + vec: torch.Tensor, + pe: torch.Tensor, + ip_adapter_extensions: list[XLabsIPAdapterExtension], + ) -> tuple[torch.Tensor, torch.Tensor]: + """A custom implementation of DoubleStreamBlock.forward() with additional features: + - IP-Adapter support + """ + img, txt, img_q = CustomDoubleStreamBlockProcessor._double_stream_block_forward(block, img, txt, vec, pe) + + # Apply IP-Adapter conditioning.
+ for ip_adapter_extension in ip_adapter_extensions: + img = ip_adapter_extension.run_ip_adapter( + timestep_index=timestep_index, + total_num_timesteps=total_num_timesteps, + block_index=block_index, + block=block, + img_q=img_q, + img=img, + ) + + return img, txt diff --git a/invokeai/backend/flux/denoise.py b/invokeai/backend/flux/denoise.py index 7ce375f4a24..025586f4e02 100644 --- a/invokeai/backend/flux/denoise.py +++ b/invokeai/backend/flux/denoise.py @@ -8,6 +8,7 @@ from invokeai.backend.flux.extensions.inpaint_extension import InpaintExtension from invokeai.backend.flux.extensions.instantx_controlnet_extension import InstantXControlNetExtension from invokeai.backend.flux.extensions.xlabs_controlnet_extension import XLabsControlNetExtension +from invokeai.backend.flux.extensions.xlabs_ip_adapter_extension import XLabsIPAdapterExtension from invokeai.backend.flux.model import Flux from invokeai.backend.stable_diffusion.diffusers_pipeline import PipelineIntermediateState @@ -32,6 +33,7 @@ def denoise( cfg_scale: list[float], inpaint_extension: InpaintExtension | None, controlnet_extensions: list[XLabsControlNetExtension | InstantXControlNetExtension], + ip_adapter_extensions: list[XLabsIPAdapterExtension], ): # step 0 is the initial state total_steps = len(timesteps) - 1 @@ -80,8 +82,11 @@ def denoise( y=vec, timesteps=t_vec, guidance=guidance_vec, + timestep_index=step_index, + total_num_timesteps=total_steps, controlnet_double_block_residuals=merged_controlnet_residuals.double_block_residuals, controlnet_single_block_residuals=merged_controlnet_residuals.single_block_residuals, + ip_adapter_extensions=ip_adapter_extensions, ) step_cfg_scale = cfg_scale[step_index] diff --git a/invokeai/backend/flux/extensions/xlabs_ip_adapter_extension.py b/invokeai/backend/flux/extensions/xlabs_ip_adapter_extension.py new file mode 100644 index 00000000000..13ebb1451f2 --- /dev/null +++ b/invokeai/backend/flux/extensions/xlabs_ip_adapter_extension.py @@ -0,0 +1,89 @@ +import math +from typing import List, Union + +import einops +import torch +from PIL import Image +from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection + +from invokeai.backend.flux.ip_adapter.xlabs_ip_adapter_flux import XlabsIpAdapterFlux +from invokeai.backend.flux.modules.layers import DoubleStreamBlock + + +class XLabsIPAdapterExtension: + def __init__( + self, + model: XlabsIpAdapterFlux, + image_prompt_clip_embed: torch.Tensor, + weight: Union[float, List[float]], + begin_step_percent: float, + end_step_percent: float, + ): + self._model = model + self._image_prompt_clip_embed = image_prompt_clip_embed + self._weight = weight + self._begin_step_percent = begin_step_percent + self._end_step_percent = end_step_percent + + self._image_proj: torch.Tensor | None = None + + def _get_weight(self, timestep_index: int, total_num_timesteps: int) -> float: + first_step = math.floor(self._begin_step_percent * total_num_timesteps) + last_step = math.ceil(self._end_step_percent * total_num_timesteps) + + if timestep_index < first_step or timestep_index > last_step: + return 0.0 + + if isinstance(self._weight, list): + return self._weight[timestep_index] + + return self._weight + + @staticmethod + def run_clip_image_encoder( + pil_image: List[Image.Image], image_encoder: CLIPVisionModelWithProjection + ) -> torch.Tensor: + clip_image_processor = CLIPImageProcessor() + clip_image: torch.Tensor = clip_image_processor(images=pil_image, return_tensors="pt").pixel_values + clip_image = 
clip_image.to(device=image_encoder.device, dtype=image_encoder.dtype) + clip_image_embeds = image_encoder(clip_image).image_embeds + return clip_image_embeds + + def run_image_proj(self, dtype: torch.dtype): + image_prompt_clip_embed = self._image_prompt_clip_embed.to(dtype=dtype) + self._image_proj = self._model.image_proj(image_prompt_clip_embed) + + def run_ip_adapter( + self, + timestep_index: int, + total_num_timesteps: int, + block_index: int, + block: DoubleStreamBlock, + img_q: torch.Tensor, + img: torch.Tensor, + ) -> torch.Tensor: + """The logic in this function is based on: + https://github.com/XLabs-AI/x-flux/blob/47495425dbed499be1e8e5a6e52628b07349cba2/src/flux/modules/layers.py#L245-L301 + """ + weight = self._get_weight(timestep_index=timestep_index, total_num_timesteps=total_num_timesteps) + if weight < 1e-6: + return img + + ip_adapter_block = self._model.ip_adapter_double_blocks.double_blocks[block_index] + + ip_key = ip_adapter_block.ip_adapter_double_stream_k_proj(self._image_proj) + ip_value = ip_adapter_block.ip_adapter_double_stream_v_proj(self._image_proj) + + # Reshape projections for multi-head attention. + ip_key = einops.rearrange(ip_key, "B L (H D) -> B H L D", H=block.num_heads, D=block.head_dim) + ip_value = einops.rearrange(ip_value, "B L (H D) -> B H L D", H=block.num_heads, D=block.head_dim) + + # Compute attention between IP projections and the latent query. + ip_attn = torch.nn.functional.scaled_dot_product_attention( + img_q, ip_key, ip_value, dropout_p=0.0, is_causal=False + ) + ip_attn = einops.rearrange(ip_attn, "B H L D -> B L (H D)", H=block.num_heads, D=block.head_dim) + + img = img + weight * ip_attn + + return img diff --git a/invokeai/backend/flux/model.py b/invokeai/backend/flux/model.py index 3ec4c3922a2..0dadacd8fe1 100644 --- a/invokeai/backend/flux/model.py +++ b/invokeai/backend/flux/model.py @@ -5,6 +5,8 @@ import torch from torch import Tensor, nn +from invokeai.backend.flux.custom_block_processor import CustomDoubleStreamBlockProcessor +from invokeai.backend.flux.extensions.xlabs_ip_adapter_extension import XLabsIPAdapterExtension from invokeai.backend.flux.modules.layers import ( DoubleStreamBlock, EmbedND, @@ -88,8 +90,11 @@ def forward( timesteps: Tensor, y: Tensor, guidance: Tensor | None, + timestep_index: int, + total_num_timesteps: int, controlnet_double_block_residuals: list[Tensor] | None, controlnet_single_block_residuals: list[Tensor] | None, + ip_adapter_extensions: list[XLabsIPAdapterExtension], ) -> Tensor: if img.ndim != 3 or txt.ndim != 3: raise ValueError("Input img and txt tensors must have 3 dimensions.") @@ -111,7 +116,19 @@ def forward( if controlnet_double_block_residuals is not None: assert len(controlnet_double_block_residuals) == len(self.double_blocks) for block_index, block in enumerate(self.double_blocks): - img, txt = block(img=img, txt=txt, vec=vec, pe=pe) + assert isinstance(block, DoubleStreamBlock) + + img, txt = CustomDoubleStreamBlockProcessor.custom_double_block_forward( + timestep_index=timestep_index, + total_num_timesteps=total_num_timesteps, + block_index=block_index, + block=block, + img=img, + txt=txt, + vec=vec, + pe=pe, + ip_adapter_extensions=ip_adapter_extensions, + ) if controlnet_double_block_residuals is not None: img += controlnet_double_block_residuals[block_index] From fdccdd52d507817fd8a994a758a12425a3002934 Mon Sep 17 00:00:00 2001 From: Ryan Dick Date: Wed, 16 Oct 2024 01:39:48 +0000 Subject: [PATCH 16/30] Fixes to get XLabsIpAdapterExtension running. 
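The rearrange patterns only need the number of heads: einops infers the head dimension D from the size of the packed axis once H is given, so the explicit D=block.head_dim argument (an attribute DoubleStreamBlock does not appear to expose) is dropped. For example (shapes here are illustrative):

    import einops
    import torch

    t = torch.randn(2, 10, 24 * 128)  # (B, L, H*D) with H=24 heads
    out = einops.rearrange(t, "B L (H D) -> B H L D", H=24)
    assert out.shape == (2, 24, 10, 128)  # D=128 inferred automatically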
--- .../backend/flux/extensions/xlabs_ip_adapter_extension.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/invokeai/backend/flux/extensions/xlabs_ip_adapter_extension.py b/invokeai/backend/flux/extensions/xlabs_ip_adapter_extension.py index 13ebb1451f2..b7a2bd85a6e 100644 --- a/invokeai/backend/flux/extensions/xlabs_ip_adapter_extension.py +++ b/invokeai/backend/flux/extensions/xlabs_ip_adapter_extension.py @@ -75,14 +75,14 @@ def run_ip_adapter( ip_value = ip_adapter_block.ip_adapter_double_stream_v_proj(self._image_proj) # Reshape projections for multi-head attention. - ip_key = einops.rearrange(ip_key, "B L (H D) -> B H L D", H=block.num_heads, D=block.head_dim) - ip_value = einops.rearrange(ip_value, "B L (H D) -> B H L D", H=block.num_heads, D=block.head_dim) + ip_key = einops.rearrange(ip_key, "B L (H D) -> B H L D", H=block.num_heads) + ip_value = einops.rearrange(ip_value, "B L (H D) -> B H L D", H=block.num_heads) # Compute attention between IP projections and the latent query. ip_attn = torch.nn.functional.scaled_dot_product_attention( img_q, ip_key, ip_value, dropout_p=0.0, is_causal=False ) - ip_attn = einops.rearrange(ip_attn, "B H L D -> B L (H D)", H=block.num_heads, D=block.head_dim) + ip_attn = einops.rearrange(ip_attn, "B H L D -> B L (H D)", H=block.num_heads) img = img + weight * ip_attn From f70a8e2c1a653084d07d5a9bf2454fdcb96157c6 Mon Sep 17 00:00:00 2001 From: Ryan Dick Date: Wed, 16 Oct 2024 01:41:02 +0000 Subject: [PATCH 17/30] A bunch of HACKS to get ViT-L CLIP vision encoder working for FLUX IP-Adapter. Need to revisit how to clean this all up long term. --- invokeai/app/invocations/ip_adapter.py | 25 +++++++---- invokeai/app/invocations/metadata.py | 2 +- .../load/model_loaders/clip_vision.py | 41 +++++++++++++++++++ .../load/model_loaders/generic_diffusers.py | 1 - 4 files changed, 60 insertions(+), 9 deletions(-) create mode 100644 invokeai/backend/model_manager/load/model_loaders/clip_vision.py diff --git a/invokeai/app/invocations/ip_adapter.py b/invokeai/app/invocations/ip_adapter.py index de40879eef8..2f18da4530e 100644 --- a/invokeai/app/invocations/ip_adapter.py +++ b/invokeai/app/invocations/ip_adapter.py @@ -9,6 +9,7 @@ from invokeai.app.invocations.model import ModelIdentifierField from invokeai.app.invocations.primitives import ImageField from invokeai.app.invocations.util import validate_begin_end_step, validate_weights +from invokeai.app.services.model_records.model_records_base import ModelRecordChanges from invokeai.app.services.shared.invocation_context import InvocationContext from invokeai.backend.model_manager.config import ( AnyModelConfig, @@ -55,10 +56,14 @@ class IPAdapterOutput(BaseInvocationOutput): ip_adapter: IPAdapterField = OutputField(description=FieldDescriptions.ip_adapter, title="IP-Adapter") -CLIP_VISION_MODEL_MAP = {"ViT-H": "ip_adapter_sd_image_encoder", "ViT-G": "ip_adapter_sdxl_image_encoder"} +CLIP_VISION_MODEL_MAP = { + "ViT-L": ("InvokeAI/clip-vit-large-patch14", "clip-vit-large-patch14-full"), + "ViT-H": ("InvokeAI/ip_adapter_sd_image_encoder", "ip_adapter_sd_image_encoder"), + "ViT-G": ("InvokeAI/ip_adapter_sdxl_image_encoder", "ip_adapter_sdxl_image_encoder"), +} -@invocation("ip_adapter", title="IP-Adapter", tags=["ip_adapter", "control"], category="ip_adapter", version="1.4.1") +@invocation("ip_adapter", title="IP-Adapter", tags=["ip_adapter", "control"], category="ip_adapter", version="1.5.0") class IPAdapterInvocation(BaseInvocation): """Collects IP-Adapter info to pass to other 
nodes.""" @@ -70,7 +75,7 @@ class IPAdapterInvocation(BaseInvocation): ui_order=-1, ui_type=UIType.IPAdapterModel, ) - clip_vision_model: Literal["ViT-H", "ViT-G"] = InputField( + clip_vision_model: Literal["ViT-L", "ViT-H", "ViT-G"] = InputField( description="CLIP Vision model to use. Overrides model settings. Mandatory for checkpoint models.", default="ViT-H", ui_order=2, @@ -111,9 +116,9 @@ def invoke(self, context: InvocationContext) -> IPAdapterOutput: image_encoder_model_id = ip_adapter_info.image_encoder_model_id image_encoder_model_name = image_encoder_model_id.split("/")[-1].strip() else: - image_encoder_model_name = CLIP_VISION_MODEL_MAP[self.clip_vision_model] + image_encoder_model_id, image_encoder_model_name = CLIP_VISION_MODEL_MAP[self.clip_vision_model] - image_encoder_model = self._get_image_encoder(context, image_encoder_model_name) + image_encoder_model = self._get_image_encoder(context, image_encoder_model_id, image_encoder_model_name) if self.method == "style": if ip_adapter_info.base == "sd-1": @@ -147,7 +152,9 @@ def invoke(self, context: InvocationContext) -> IPAdapterOutput: ), ) - def _get_image_encoder(self, context: InvocationContext, image_encoder_model_name: str) -> AnyModelConfig: + def _get_image_encoder( + self, context: InvocationContext, image_encoder_model_id: str, image_encoder_model_name: str + ) -> AnyModelConfig: image_encoder_models = context.models.search_by_attrs( name=image_encoder_model_name, base=BaseModelType.Any, type=ModelType.CLIPVision ) @@ -159,7 +166,11 @@ def _get_image_encoder(self, context: InvocationContext, image_encoder_model_nam ) installer = context._services.model_manager.install - job = installer.heuristic_import(f"InvokeAI/{image_encoder_model_name}") + # Note: We hard-code the type to CLIPVision here because if the model contains both a CLIPVision and a + # CLIPText model, the probe may treat it as a CLIPText model. 
+ job = installer.heuristic_import( + image_encoder_model_id, ModelRecordChanges(name=image_encoder_model_name, type=ModelType.CLIPVision) + ) installer.wait_for_job(job, timeout=600) # Wait for up to 10 minutes image_encoder_models = context.models.search_by_attrs( name=image_encoder_model_name, base=BaseModelType.Any, type=ModelType.CLIPVision diff --git a/invokeai/app/invocations/metadata.py b/invokeai/app/invocations/metadata.py index 19e75036035..c3142c824ae 100644 --- a/invokeai/app/invocations/metadata.py +++ b/invokeai/app/invocations/metadata.py @@ -40,7 +40,7 @@ class IPAdapterMetadataField(BaseModel): image: ImageField = Field(description="The IP-Adapter image prompt.") ip_adapter_model: ModelIdentifierField = Field(description="The IP-Adapter model.") - clip_vision_model: Literal["ViT-H", "ViT-G"] = Field(description="The CLIP Vision model") + clip_vision_model: Literal["ViT-L", "ViT-H", "ViT-G"] = Field(description="The CLIP Vision model") method: Literal["full", "style", "composition"] = Field(description="Method to apply IP Weights with") weight: Union[float, list[float]] = Field(description="The weight given to the IP-Adapter") begin_step_percent: float = Field(description="When the IP-Adapter is first applied (% of total steps)") diff --git a/invokeai/backend/model_manager/load/model_loaders/clip_vision.py b/invokeai/backend/model_manager/load/model_loaders/clip_vision.py new file mode 100644 index 00000000000..432e0f11756 --- /dev/null +++ b/invokeai/backend/model_manager/load/model_loaders/clip_vision.py @@ -0,0 +1,41 @@ +from pathlib import Path +from typing import Optional + +from transformers import CLIPVisionModelWithProjection + +from invokeai.backend.model_manager.config import ( + AnyModel, + AnyModelConfig, + BaseModelType, + DiffusersConfigBase, + ModelFormat, + ModelType, + SubModelType, +) +from invokeai.backend.model_manager.load.load_default import ModelLoader +from invokeai.backend.model_manager.load.model_loader_registry import ModelLoaderRegistry + + +@ModelLoaderRegistry.register(base=BaseModelType.Any, type=ModelType.CLIPVision, format=ModelFormat.Diffusers) +class ClipVisionLoader(ModelLoader): + """Class to load CLIPVision models.""" + + def _load_model( + self, + config: AnyModelConfig, + submodel_type: Optional[SubModelType] = None, + ) -> AnyModel: + if not isinstance(config, DiffusersConfigBase): + raise ValueError("Only DiffusersConfigBase models are currently supported here.") + + if submodel_type is not None: + raise Exception(f"There are no submodels in models of type {model_class}") + + model_path = Path(config.path) + + model = CLIPVisionModelWithProjection.from_pretrained( + model_path, torch_dtype=self._torch_dtype, local_files_only=True + ) + assert isinstance(model, CLIPVisionModelWithProjection) + + return model diff --git a/invokeai/backend/model_manager/load/model_loaders/generic_diffusers.py b/invokeai/backend/model_manager/load/model_loaders/generic_diffusers.py index f1691ec4d4b..4ce51a56d04 100644 --- a/invokeai/backend/model_manager/load/model_loaders/generic_diffusers.py +++ b/invokeai/backend/model_manager/load/model_loaders/generic_diffusers.py @@ -22,7 +22,6 @@ from invokeai.backend.model_manager.load.model_loader_registry import ModelLoaderRegistry -@ModelLoaderRegistry.register(base=BaseModelType.Any, type=ModelType.CLIPVision, format=ModelFormat.Diffusers) @ModelLoaderRegistry.register(base=BaseModelType.Any, type=ModelType.T2IAdapter, format=ModelFormat.Diffusers) class GenericDiffusersLoader(ModelLoader): """Class to 
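The negative branch of CFG now gets its own IP-Adapter conditioning: the CLIP vision encoder is run twice (once on the image prompt and, in this draft, once on a zeroed copy of the processed pixel tensor), and separate positive/negative extension lists are passed into denoise() so the unconditional prediction sees an "empty" image prompt. The combination step itself is unchanged; schematically, with cfg_scale = 3.0:

    pred = neg_pred + 3.0 * (pos_pred - neg_pred)

so the result is pushed away from the prediction conditioned on the negative text and negative image prompt.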
load simple diffusers models.""" From dde54740c5773a1acb927ae554cdd6fbc772c581 Mon Sep 17 00:00:00 2001 From: Ryan Dick Date: Wed, 16 Oct 2024 18:11:48 +0000 Subject: [PATCH 18/30] Test out IP-Adapter with CFG. --- invokeai/app/invocations/flux_denoise.py | 62 ++++++++++++++++-------- invokeai/backend/flux/denoise.py | 14 ++++-- 2 files changed, 51 insertions(+), 25 deletions(-) diff --git a/invokeai/app/invocations/flux_denoise.py b/invokeai/app/invocations/flux_denoise.py index 1b7dea7b607..c3167cfd51d 100644 --- a/invokeai/app/invocations/flux_denoise.py +++ b/invokeai/app/invocations/flux_denoise.py @@ -4,7 +4,7 @@ import torch import torchvision.transforms as tv_transforms from torchvision.transforms.functional import resize as tv_resize -from transformers import CLIPVisionModelWithProjection +from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection from invokeai.app.invocations.baseinvocation import BaseInvocation, Classification, invocation from invokeai.app.invocations.fields import ( @@ -258,7 +258,9 @@ def _run_diffusion( # We do this before loading other models to minimize peak memory. # TODO(ryand): We should really do this in a separate invocation to benefit from caching. ip_adapter_fields = self._normalize_ip_adapter_fields() - image_prompt_clip_embeds = self._prep_ip_adapter_image_prompt_clip_embeds(ip_adapter_fields, context) + pos_image_prompt_clip_embeds, neg_image_prompt_clip_embeds = self._prep_ip_adapter_image_prompt_clip_embeds( + ip_adapter_fields, context + ) cfg_scale = self.prep_cfg_scale( cfg_scale=self.cfg_scale, @@ -316,8 +318,9 @@ def _run_diffusion( raise ValueError(f"Unsupported model format: {config.format}") # Prepare IP-Adapter extensions. - ip_adapter_extensions = self._prep_ip_adapter_extensions( - image_prompt_clip_embeds=image_prompt_clip_embeds, + pos_ip_adapter_extensions, neg_ip_adapter_extensions = self._prep_ip_adapter_extensions( + pos_image_prompt_clip_embeds=pos_image_prompt_clip_embeds, + neg_image_prompt_clip_embeds=neg_image_prompt_clip_embeds, ip_adapter_fields=ip_adapter_fields, context=context, exit_stack=exit_stack, @@ -340,7 +343,8 @@ def _run_diffusion( cfg_scale=cfg_scale, inpaint_extension=inpaint_extension, controlnet_extensions=controlnet_extensions, - ip_adapter_extensions=ip_adapter_extensions, + pos_ip_adapter_extensions=pos_ip_adapter_extensions, + neg_ip_adapter_extensions=neg_ip_adapter_extensions, ) x = unpack(x.float(), self.height, self.width) @@ -548,9 +552,12 @@ def _prep_ip_adapter_image_prompt_clip_embeds( self, ip_adapter_fields: list[IPAdapterField], context: InvocationContext, - ) -> list[torch.Tensor]: + ) -> tuple[list[torch.Tensor], list[torch.Tensor]]: """Run the IPAdapter CLIPVisionModel, returning image prompt embeddings.""" - image_prompt_clip_embeds: list[torch.Tensor] = [] + clip_image_processor = CLIPImageProcessor() + + pos_image_prompt_clip_embeds: list[torch.Tensor] = [] + neg_image_prompt_clip_embeds: list[torch.Tensor] = [] for ip_adapter_field in ip_adapter_fields: # `ip_adapter_field.image` could be a list or a single ImageField. Normalize to a list here. 
ipa_image_fields: list[ImageField] @@ -565,24 +572,30 @@ def _prep_ip_adapter_image_prompt_clip_embeds( with context.models.load(ip_adapter_field.image_encoder_model) as image_encoder_model: assert isinstance(image_encoder_model, CLIPVisionModelWithProjection) - image_prompt_clip_embeds.append( - XLabsIPAdapterExtension.run_clip_image_encoder( - pil_image=ipa_images, - image_encoder=image_encoder_model, - ) - ) - return image_prompt_clip_embeds + clip_image: torch.Tensor = clip_image_processor(images=ipa_images, return_tensors="pt").pixel_values + clip_image = clip_image.to(device=image_encoder_model.device, dtype=image_encoder_model.dtype) + pos_clip_image_embeds = image_encoder_model(clip_image).image_embeds + neg_clip_image_embeds = image_encoder_model(torch.zeros_like(clip_image)).image_embeds + + pos_image_prompt_clip_embeds.append(pos_clip_image_embeds) + neg_image_prompt_clip_embeds.append(neg_clip_image_embeds) + + return pos_image_prompt_clip_embeds, neg_image_prompt_clip_embeds def _prep_ip_adapter_extensions( self, ip_adapter_fields: list[IPAdapterField], - image_prompt_clip_embeds: list[torch.Tensor], + pos_image_prompt_clip_embeds: list[torch.Tensor], + neg_image_prompt_clip_embeds: list[torch.Tensor], context: InvocationContext, exit_stack: ExitStack, dtype: torch.dtype, ) -> list[XLabsIPAdapterExtension]: - ip_adapter_extensions: list[XLabsIPAdapterExtension] = [] - for ip_adapter_field, image_prompt_clip_embed in zip(ip_adapter_fields, image_prompt_clip_embeds, strict=True): + pos_ip_adapter_extensions: list[XLabsIPAdapterExtension] = [] + neg_ip_adapter_extensions: list[XLabsIPAdapterExtension] = [] + for ip_adapter_field, pos_image_prompt_clip_embed, neg_image_prompt_clip_embed in zip( + ip_adapter_fields, pos_image_prompt_clip_embeds, neg_image_prompt_clip_embeds, strict=True + ): ip_adapter_model = exit_stack.enter_context(context.models.load(ip_adapter_field.ip_adapter_model)) assert isinstance(ip_adapter_model, XlabsIpAdapterFlux) ip_adapter_model = ip_adapter_model.to(dtype=dtype) @@ -590,16 +603,25 @@ def _prep_ip_adapter_extensions( raise ValueError("IP-Adapter masks are not yet supported in Flux.") ip_adapter_extension = XLabsIPAdapterExtension( model=ip_adapter_model, - image_prompt_clip_embed=image_prompt_clip_embed, + image_prompt_clip_embed=pos_image_prompt_clip_embed, weight=ip_adapter_field.weight, begin_step_percent=ip_adapter_field.begin_step_percent, end_step_percent=ip_adapter_field.end_step_percent, ) + ip_adapter_extension.run_image_proj(dtype=dtype) + pos_ip_adapter_extensions.append(ip_adapter_extension) + ip_adapter_extension = XLabsIPAdapterExtension( + model=ip_adapter_model, + image_prompt_clip_embed=neg_image_prompt_clip_embed, + weight=ip_adapter_field.weight, + begin_step_percent=ip_adapter_field.begin_step_percent, + end_step_percent=ip_adapter_field.end_step_percent, + ) ip_adapter_extension.run_image_proj(dtype=dtype) - ip_adapter_extensions.append(ip_adapter_extension) + neg_ip_adapter_extensions.append(ip_adapter_extension) - return ip_adapter_extensions + return pos_ip_adapter_extensions, neg_ip_adapter_extensions def _lora_iterator(self, context: InvocationContext) -> Iterator[Tuple[LoRAModelRaw, float]]: for lora in self.transformer.loras: diff --git a/invokeai/backend/flux/denoise.py b/invokeai/backend/flux/denoise.py index 025586f4e02..bb0e60409a8 100644 --- a/invokeai/backend/flux/denoise.py +++ b/invokeai/backend/flux/denoise.py @@ -33,7 +33,8 @@ def denoise( cfg_scale: list[float], inpaint_extension: InpaintExtension | None, 
controlnet_extensions: list[XLabsControlNetExtension | InstantXControlNetExtension], - ip_adapter_extensions: list[XLabsIPAdapterExtension], + pos_ip_adapter_extensions: list[XLabsIPAdapterExtension], + neg_ip_adapter_extensions: list[XLabsIPAdapterExtension], ): # step 0 is the initial state total_steps = len(timesteps) - 1 @@ -69,7 +70,7 @@ def denoise( ) # Merge the ControlNet residuals from multiple ControlNets. - # TODO(ryand): We may want to alculate the sum just-in-time to keep peak memory low. Keep in mind, that the + # TODO(ryand): We may want to calculate the sum just-in-time to keep peak memory low. Keep in mind, that the # controlnet_residuals datastructure is efficient in that it likely contains multiple references to the same # tensors. Calculating the sum materializes each tensor into its own instance. merged_controlnet_residuals = sum_controlnet_flux_outputs(controlnet_residuals) @@ -86,15 +87,15 @@ def denoise( total_num_timesteps=total_steps, controlnet_double_block_residuals=merged_controlnet_residuals.double_block_residuals, controlnet_single_block_residuals=merged_controlnet_residuals.single_block_residuals, - ip_adapter_extensions=ip_adapter_extensions, + ip_adapter_extensions=pos_ip_adapter_extensions, ) step_cfg_scale = cfg_scale[step_index] # If step_cfg_scale, is 1.0, then we don't need to run the negative prediction. if not math.isclose(step_cfg_scale, 1.0): - # TODO(ryand): Add option to run positive and negative predictions in a single batch for better performance on - # systems with sufficient VRAM. + # TODO(ryand): Add option to run positive and negative predictions in a single batch for better performance + # on systems with sufficient VRAM. if neg_txt is None or neg_txt_ids is None or neg_vec is None: raise ValueError("Negative text conditioning is required when cfg_scale is not 1.0.") @@ -107,8 +108,11 @@ def denoise( y=neg_vec, timesteps=t_vec, guidance=guidance_vec, + timestep_index=step_index, + total_num_timesteps=total_steps, controlnet_double_block_residuals=None, controlnet_single_block_residuals=None, + ip_adapter_extensions=neg_ip_adapter_extensions, ) pred = neg_pred + step_cfg_scale * (pred - neg_pred) From 73bbb12f7a983bddbb264233e563b3c9ea5261b1 Mon Sep 17 00:00:00 2001 From: Ryan Dick Date: Fri, 18 Oct 2024 18:52:12 +0000 Subject: [PATCH 19/30] Use a black image as the negative IP prompt for parity with X-Labs implementation. 
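Feeding a real black image through CLIPImageProcessor is not the same as zeroing the processed tensor: with the processor's default normalization, a black image maps to non-zero normalized pixel values, while zeros_like() on pixel_values corresponds to the dataset-mean color. The black-image scheme matches the X-Labs ComfyUI node linked in the code comment. A small sketch of the scheme kept here (image size is illustrative):

    import numpy as np
    from transformers import CLIPImageProcessor

    proc = CLIPImageProcessor()
    black = np.zeros((512, 512, 3), dtype=np.uint8)  # black RGB image
    neg_pixels = proc(images=[black], return_tensors="pt").pixel_values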
--- invokeai/app/invocations/flux_denoise.py | 24 +++++++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/invokeai/app/invocations/flux_denoise.py b/invokeai/app/invocations/flux_denoise.py index c3167cfd51d..e5413d05520 100644 --- a/invokeai/app/invocations/flux_denoise.py +++ b/invokeai/app/invocations/flux_denoise.py @@ -1,6 +1,8 @@ from contextlib import ExitStack from typing import Callable, Iterator, Optional, Tuple +import numpy as np +import numpy.typing as npt import torch import torchvision.transforms as tv_transforms from torchvision.transforms.functional import resize as tv_resize @@ -570,12 +572,28 @@ def _prep_ip_adapter_image_prompt_clip_embeds( ipa_images = [context.images.get_pil(image.image_name) for image in ipa_image_fields] + pos_images: list[npt.NDArray[np.uint8]] = [] + neg_images: list[npt.NDArray[np.uint8]] = [] + for ipa_image in ipa_images: + assert ipa_image.mode == "RGB" + pos_image = np.array(ipa_image) + # We use a black image as the negative image prompt for parity with + # https://github.com/XLabs-AI/x-flux-comfyui/blob/45c834727dd2141aebc505ae4b01f193a8414e38/nodes.py#L592-L593 + # An alternative scheme would be to apply zeros_like() after calling the clip_image_processor. + neg_image = np.zeros_like(pos_image) + pos_images.append(pos_image) + neg_images.append(neg_image) + with context.models.load(ip_adapter_field.image_encoder_model) as image_encoder_model: assert isinstance(image_encoder_model, CLIPVisionModelWithProjection) - clip_image: torch.Tensor = clip_image_processor(images=ipa_images, return_tensors="pt").pixel_values + + clip_image: torch.Tensor = clip_image_processor(images=pos_images, return_tensors="pt").pixel_values clip_image = clip_image.to(device=image_encoder_model.device, dtype=image_encoder_model.dtype) pos_clip_image_embeds = image_encoder_model(clip_image).image_embeds - neg_clip_image_embeds = image_encoder_model(torch.zeros_like(clip_image)).image_embeds + + clip_image = clip_image_processor(images=neg_images, return_tensors="pt").pixel_values + clip_image = clip_image.to(device=image_encoder_model.device, dtype=image_encoder_model.dtype) + neg_clip_image_embeds = image_encoder_model(clip_image).image_embeds pos_image_prompt_clip_embeds.append(pos_clip_image_embeds) neg_image_prompt_clip_embeds.append(neg_clip_image_embeds) @@ -590,7 +608,7 @@ def _prep_ip_adapter_extensions( context: InvocationContext, exit_stack: ExitStack, dtype: torch.dtype, - ) -> list[XLabsIPAdapterExtension]: + ) -> tuple[list[XLabsIPAdapterExtension], list[XLabsIPAdapterExtension]]: pos_ip_adapter_extensions: list[XLabsIPAdapterExtension] = [] neg_ip_adapter_extensions: list[XLabsIPAdapterExtension] = [] for ip_adapter_field, pos_image_prompt_clip_embed, neg_image_prompt_clip_embed in zip( From 554611012785383ce27d573be66676b07fa1eb88 Mon Sep 17 00:00:00 2001 From: Ryan Dick Date: Mon, 21 Oct 2024 18:23:12 +0000 Subject: [PATCH 20/30] Add FluxIPAdapterInvocation. 
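The new node mirrors IPAdapterInvocation but drops the features the FLUX path does not support yet (weight method, regional masks, per-layer target blocks) and emits a standard IPAdapterField, so it plugs straight into the FLUX Denoise node's ip_adapter input. The begin/end step percentages feed the extension's weight schedule; for example, with 30 denoising steps:

    begin_step_percent = 0.2  ->  first_step = floor(0.2 * 30) = 6
    end_step_percent   = 0.8  ->  last_step  = ceil(0.8 * 30)  = 24

so the adapter contributes nothing outside steps 6..24.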
--- invokeai/app/invocations/flux_ip_adapter.py | 94 +++++++++++++++++++++ invokeai/app/invocations/ip_adapter.py | 11 +-- 2 files changed, 100 insertions(+), 5 deletions(-) create mode 100644 invokeai/app/invocations/flux_ip_adapter.py diff --git a/invokeai/app/invocations/flux_ip_adapter.py b/invokeai/app/invocations/flux_ip_adapter.py new file mode 100644 index 00000000000..d44006500a4 --- /dev/null +++ b/invokeai/app/invocations/flux_ip_adapter.py @@ -0,0 +1,94 @@ +from builtins import float +from typing import List, Literal, Union + +from pydantic import field_validator, model_validator +from typing_extensions import Self + +from invokeai.app.invocations.baseinvocation import BaseInvocation, Classification, invocation +from invokeai.app.invocations.fields import InputField, UIType +from invokeai.app.invocations.ip_adapter import ( + CLIP_VISION_MODEL_MAP, + IPAdapterField, + IPAdapterInvocation, + IPAdapterOutput, +) +from invokeai.app.invocations.model import ModelIdentifierField +from invokeai.app.invocations.primitives import ImageField +from invokeai.app.invocations.util import validate_begin_end_step, validate_weights +from invokeai.app.services.shared.invocation_context import InvocationContext +from invokeai.backend.model_manager.config import ( + IPAdapterCheckpointConfig, + IPAdapterInvokeAIConfig, +) + + +@invocation( + "flux_ip_adapter", + title="FLUX IP-Adapter", + tags=["ip_adapter", "control"], + category="ip_adapter", + version="1.0.0", + classification=Classification.Prototype, +) +class FluxIPAdapterInvocation(BaseInvocation): + """Collects FLUX IP-Adapter info to pass to other nodes.""" + + # FLUXIPAdapterInvocation is based closely on IPAdapterInvocation, but with some unsupported features removed. + + image: Union[ImageField, List[ImageField]] = InputField(description="The IP-Adapter image prompt(s).") + ip_adapter_model: ModelIdentifierField = InputField( + description="The IP-Adapter model.", title="IP-Adapter Model", ui_type=UIType.IPAdapterModel + ) + clip_vision_model: Literal["ViT-L"] = InputField( + description="CLIP Vision model to use. Only applied if the correct CLIP Vision model cannot be detected from " + + "the model config.", + default="ViT-L", + ) + weight: Union[float, List[float]] = InputField( + default=1, description="The weight given to the IP-Adapter", title="Weight" + ) + begin_step_percent: float = InputField( + default=0, ge=0, le=1, description="When the IP-Adapter is first applied (% of total steps)" + ) + end_step_percent: float = InputField( + default=1, ge=0, le=1, description="When the IP-Adapter is last applied (% of total steps)" + ) + + @field_validator("weight") + @classmethod + def validate_ip_adapter_weight(cls, v: float) -> float: + validate_weights(v) + return v + + @model_validator(mode="after") + def validate_begin_end_step_percent(self) -> Self: + validate_begin_end_step(self.begin_step_percent, self.end_step_percent) + return self + + def invoke(self, context: InvocationContext) -> IPAdapterOutput: + # Lookup the CLIP Vision encoder that is intended to be used with the IP-Adapter model. 
+ ip_adapter_info = context.models.get_config(self.ip_adapter_model.key) + assert isinstance(ip_adapter_info, (IPAdapterInvokeAIConfig, IPAdapterCheckpointConfig)) + + if isinstance(ip_adapter_info, IPAdapterInvokeAIConfig): + image_encoder_model_id = ip_adapter_info.image_encoder_model_id + image_encoder_model_name = image_encoder_model_id.split("/")[-1].strip() + else: + image_encoder_model_id, image_encoder_model_name = CLIP_VISION_MODEL_MAP[self.clip_vision_model] + + image_encoder_model = IPAdapterInvocation.get_clip_image_encoder( + context, image_encoder_model_id, image_encoder_model_name + ) + + return IPAdapterOutput( + ip_adapter=IPAdapterField( + image=self.image, + ip_adapter_model=self.ip_adapter_model, + image_encoder_model=ModelIdentifierField.from_config(image_encoder_model), + weight=self.weight, + target_blocks=[], # target_blocks is currently unused for FLUX IP-Adapters. + begin_step_percent=self.begin_step_percent, + end_step_percent=self.end_step_percent, + mask=None, # mask is currently unused for FLUX IP-Adapters. + ), + ) diff --git a/invokeai/app/invocations/ip_adapter.py b/invokeai/app/invocations/ip_adapter.py index 2f18da4530e..63624398700 100644 --- a/invokeai/app/invocations/ip_adapter.py +++ b/invokeai/app/invocations/ip_adapter.py @@ -57,7 +57,7 @@ class IPAdapterOutput(BaseInvocationOutput): CLIP_VISION_MODEL_MAP = { - "ViT-L": ("InvokeAI/clip-vit-large-patch14", "clip-vit-large-patch14-full"), + "ViT-L": ("InvokeAI/clip-vit-large-patch14", "clip-vit-large-patch14"), "ViT-H": ("InvokeAI/ip_adapter_sd_image_encoder", "ip_adapter_sd_image_encoder"), "ViT-G": ("InvokeAI/ip_adapter_sdxl_image_encoder", "ip_adapter_sdxl_image_encoder"), } @@ -75,7 +75,7 @@ class IPAdapterInvocation(BaseInvocation): ui_order=-1, ui_type=UIType.IPAdapterModel, ) - clip_vision_model: Literal["ViT-L", "ViT-H", "ViT-G"] = InputField( + clip_vision_model: Literal["ViT-H", "ViT-G"] = InputField( description="CLIP Vision model to use. Overrides model settings. Mandatory for checkpoint models.", default="ViT-H", ui_order=2, @@ -118,7 +118,7 @@ def invoke(self, context: InvocationContext) -> IPAdapterOutput: else: image_encoder_model_id, image_encoder_model_name = CLIP_VISION_MODEL_MAP[self.clip_vision_model] - image_encoder_model = self._get_image_encoder(context, image_encoder_model_id, image_encoder_model_name) + image_encoder_model = self.get_clip_image_encoder(context, image_encoder_model_id, image_encoder_model_name) if self.method == "style": if ip_adapter_info.base == "sd-1": @@ -152,8 +152,9 @@ def invoke(self, context: InvocationContext) -> IPAdapterOutput: ), ) - def _get_image_encoder( - self, context: InvocationContext, image_encoder_model_id: str, image_encoder_model_name: str + @classmethod + def get_clip_image_encoder( + cls, context: InvocationContext, image_encoder_model_id: str, image_encoder_model_name: str ) -> AnyModelConfig: image_encoder_models = context.models.search_by_attrs( name=image_encoder_model_name, base=BaseModelType.Any, type=ModelType.CLIPVision From 90a906e203ff029f290f7b6d6caced0c021d51f5 Mon Sep 17 00:00:00 2001 From: Ryan Dick Date: Mon, 21 Oct 2024 19:54:21 +0000 Subject: [PATCH 21/30] Simplify handling of CLIP ViT selection for FLUX IP-Adapter invocation. 
--- invokeai/app/invocations/flux_ip_adapter.py | 14 ++++---------- invokeai/backend/model_manager/config.py | 2 ++ 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/invokeai/app/invocations/flux_ip_adapter.py b/invokeai/app/invocations/flux_ip_adapter.py index d44006500a4..556c5703f28 100644 --- a/invokeai/app/invocations/flux_ip_adapter.py +++ b/invokeai/app/invocations/flux_ip_adapter.py @@ -39,11 +39,8 @@ class FluxIPAdapterInvocation(BaseInvocation): ip_adapter_model: ModelIdentifierField = InputField( description="The IP-Adapter model.", title="IP-Adapter Model", ui_type=UIType.IPAdapterModel ) - clip_vision_model: Literal["ViT-L"] = InputField( - description="CLIP Vision model to use. Only applied if the correct CLIP Vision model cannot be detected from " - + "the model config.", - default="ViT-L", - ) + # Currently, the only known ViT model used by FLUX IP-Adapters is ViT-L. + clip_vision_model: Literal["ViT-L"] = InputField(description="CLIP Vision model to use.", default="ViT-L") weight: Union[float, List[float]] = InputField( default=1, description="The weight given to the IP-Adapter", title="Weight" ) @@ -70,11 +67,8 @@ def invoke(self, context: InvocationContext) -> IPAdapterOutput: ip_adapter_info = context.models.get_config(self.ip_adapter_model.key) assert isinstance(ip_adapter_info, (IPAdapterInvokeAIConfig, IPAdapterCheckpointConfig)) - if isinstance(ip_adapter_info, IPAdapterInvokeAIConfig): - image_encoder_model_id = ip_adapter_info.image_encoder_model_id - image_encoder_model_name = image_encoder_model_id.split("/")[-1].strip() - else: - image_encoder_model_id, image_encoder_model_name = CLIP_VISION_MODEL_MAP[self.clip_vision_model] + # Note: There is a IPAdapterInvokeAIConfig.image_encoder_model_id field, but it isn't trustworthy. + image_encoder_model_id, image_encoder_model_name = CLIP_VISION_MODEL_MAP[self.clip_vision_model] image_encoder_model = IPAdapterInvocation.get_clip_image_encoder( context, image_encoder_model_id, image_encoder_model_name diff --git a/invokeai/backend/model_manager/config.py b/invokeai/backend/model_manager/config.py index f1c262df994..ab1ee46e9ff 100644 --- a/invokeai/backend/model_manager/config.py +++ b/invokeai/backend/model_manager/config.py @@ -394,6 +394,8 @@ class IPAdapterBaseConfig(ModelConfigBase): class IPAdapterInvokeAIConfig(IPAdapterBaseConfig): """Model config for IP Adapter diffusers format models.""" + # TODO(ryand): Should we deprecate this field? From what I can tell, it hasn't been probed correctly for a long + # time. Need to go through the history to make sure I'm understanding this fully. image_encoder_model_id: str format: Literal[ModelFormat.InvokeAI] From e8cd1bb3d88cbb84cae12b6a58ae84a86f19a866 Mon Sep 17 00:00:00 2001 From: Ryan Dick Date: Mon, 21 Oct 2024 22:17:42 +0000 Subject: [PATCH 22/30] Add FLUX IP-Adapter starter models. 
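The FLUX IP-Adapter starter entry lists the ViT-L image encoder as a dependency, so installing the adapter from the starter models list also pulls the encoder it needs:

    ip_adapter_flux (ModelType.IPAdapter, base=Flux, source: XLabs-AI/flux-ip-adapter)
        depends on -> clip_vit_l_image_encoder (ModelType.CLIPVision, base=Any, source: InvokeAI/clip-vit-large-patch14)

The encoder shares its source repo with the existing CLIPEmbed starter model but gets its own CLIPVision record so the correct loader is used, and CLIP_VISION_MODEL_MAP now resolves its entries from these StarterModel definitions instead of hard-coded (source, name) tuples.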
--- invokeai/app/invocations/flux_ip_adapter.py | 5 +-- invokeai/app/invocations/ip_adapter.py | 18 +++++++--- .../backend/model_manager/starter_models.py | 36 ++++++++++++++----- 3 files changed, 44 insertions(+), 15 deletions(-) diff --git a/invokeai/app/invocations/flux_ip_adapter.py b/invokeai/app/invocations/flux_ip_adapter.py index 556c5703f28..1b342b3c7a3 100644 --- a/invokeai/app/invocations/flux_ip_adapter.py +++ b/invokeai/app/invocations/flux_ip_adapter.py @@ -68,8 +68,9 @@ def invoke(self, context: InvocationContext) -> IPAdapterOutput: assert isinstance(ip_adapter_info, (IPAdapterInvokeAIConfig, IPAdapterCheckpointConfig)) # Note: There is a IPAdapterInvokeAIConfig.image_encoder_model_id field, but it isn't trustworthy. - image_encoder_model_id, image_encoder_model_name = CLIP_VISION_MODEL_MAP[self.clip_vision_model] - + image_encoder_starter_model = CLIP_VISION_MODEL_MAP[self.clip_vision_model] + image_encoder_model_id = image_encoder_starter_model.source + image_encoder_model_name = image_encoder_starter_model.name image_encoder_model = IPAdapterInvocation.get_clip_image_encoder( context, image_encoder_model_id, image_encoder_model_name ) diff --git a/invokeai/app/invocations/ip_adapter.py b/invokeai/app/invocations/ip_adapter.py index 63624398700..361e8f0a71b 100644 --- a/invokeai/app/invocations/ip_adapter.py +++ b/invokeai/app/invocations/ip_adapter.py @@ -18,6 +18,12 @@ IPAdapterInvokeAIConfig, ModelType, ) +from invokeai.backend.model_manager.starter_models import ( + StarterModel, + clip_vit_l_image_encoder, + ip_adapter_sd_image_encoder, + ip_adapter_sdxl_image_encoder, +) class IPAdapterField(BaseModel): @@ -56,10 +62,10 @@ class IPAdapterOutput(BaseInvocationOutput): ip_adapter: IPAdapterField = OutputField(description=FieldDescriptions.ip_adapter, title="IP-Adapter") -CLIP_VISION_MODEL_MAP = { - "ViT-L": ("InvokeAI/clip-vit-large-patch14", "clip-vit-large-patch14"), - "ViT-H": ("InvokeAI/ip_adapter_sd_image_encoder", "ip_adapter_sd_image_encoder"), - "ViT-G": ("InvokeAI/ip_adapter_sdxl_image_encoder", "ip_adapter_sdxl_image_encoder"), +CLIP_VISION_MODEL_MAP: dict[Literal["ViT-L", "ViT-H", "ViT-G"], StarterModel] = { + "ViT-L": clip_vit_l_image_encoder, + "ViT-H": ip_adapter_sd_image_encoder, + "ViT-G": ip_adapter_sdxl_image_encoder, } @@ -116,7 +122,9 @@ def invoke(self, context: InvocationContext) -> IPAdapterOutput: image_encoder_model_id = ip_adapter_info.image_encoder_model_id image_encoder_model_name = image_encoder_model_id.split("/")[-1].strip() else: - image_encoder_model_id, image_encoder_model_name = CLIP_VISION_MODEL_MAP[self.clip_vision_model] + image_encoder_starter_model = CLIP_VISION_MODEL_MAP[self.clip_vision_model] + image_encoder_model_id = image_encoder_starter_model.source + image_encoder_model_name = image_encoder_starter_model.name image_encoder_model = self.get_clip_image_encoder(context, image_encoder_model_id, image_encoder_model_name) diff --git a/invokeai/backend/model_manager/starter_models.py b/invokeai/backend/model_manager/starter_models.py index 66568f0a0dd..05b61d35e5b 100644 --- a/invokeai/backend/model_manager/starter_models.py +++ b/invokeai/backend/model_manager/starter_models.py @@ -25,6 +25,15 @@ class StarterModelBundles(BaseModel): models: list[StarterModel] +cyberrealistic_negative = StarterModel( + name="CyberRealistic Negative v3", + base=BaseModelType.StableDiffusion1, + source="https://huggingface.co/cyberdelia/CyberRealistic_Negative/resolve/main/CyberRealistic_Negative_v3.pt", + description="Negative embedding 
specifically for use with CyberRealistic.", + type=ModelType.TextualInversion, +) + +# region CLIP Image Encoders ip_adapter_sd_image_encoder = StarterModel( name="IP Adapter SD1.5 Image Encoder", base=BaseModelType.StableDiffusion1, @@ -32,7 +41,6 @@ class StarterModelBundles(BaseModel): description="IP Adapter SD Image Encoder", type=ModelType.CLIPVision, ) - ip_adapter_sdxl_image_encoder = StarterModel( name="IP Adapter SDXL Image Encoder", base=BaseModelType.StableDiffusionXL, @@ -40,14 +48,16 @@ class StarterModelBundles(BaseModel): description="IP Adapter SDXL Image Encoder", type=ModelType.CLIPVision, ) - -cyberrealistic_negative = StarterModel( - name="CyberRealistic Negative v3", - base=BaseModelType.StableDiffusion1, - source="https://huggingface.co/cyberdelia/CyberRealistic_Negative/resolve/main/CyberRealistic_Negative_v3.pt", - description="Negative embedding specifically for use with CyberRealistic.", - type=ModelType.TextualInversion, +# Note: This model is installed from the same source as the CLIPEmbed model below. The model contains both the image +# encoder and the text encoder, but we need separate model entries so that they get loaded correctly. +clip_vit_l_image_encoder = StarterModel( + name="clip-vit-large-patch14", + base=BaseModelType.Any, + source="InvokeAI/clip-vit-large-patch14", + description="CLIP ViT-L Image Encoder", + type=ModelType.CLIPVision, ) +# endregion # region TextEncoders t5_base_encoder = StarterModel( @@ -254,6 +264,14 @@ class StarterModelBundles(BaseModel): type=ModelType.IPAdapter, dependencies=[ip_adapter_sdxl_image_encoder], ) +ip_adapter_flux = StarterModel( + name="XLabs FLUX IP-Adapter", + base=BaseModelType.Flux, + source="https://huggingface.co/XLabs-AI/flux-ip-adapter/resolve/main/flux-ip-adapter.safetensors", + description="FLUX IP-Adapter", + type=ModelType.IPAdapter, + dependencies=[clip_vit_l_image_encoder], +) # endregion # region ControlNet qr_code_cnet_sd1 = StarterModel( @@ -555,6 +573,7 @@ class StarterModelBundles(BaseModel): ip_adapter_plus_sd1, ip_adapter_plus_face_sd1, ip_adapter_sdxl, + ip_adapter_flux, qr_code_cnet_sd1, qr_code_cnet_sdxl, canny_sd1, @@ -642,6 +661,7 @@ class StarterModelBundles(BaseModel): t5_8b_quantized_encoder, clip_l_encoder, union_cnet_flux, + ip_adapter_flux, ] STARTER_BUNDLES: dict[str, list[StarterModel]] = { From e545f18a45b17b9a6bcc7afb53d71c4e57b5894e Mon Sep 17 00:00:00 2001 From: Ryan Dick Date: Mon, 21 Oct 2024 22:38:06 +0000 Subject: [PATCH 23/30] (minor) Fix ruff. --- .../backend/model_manager/load/model_loaders/clip_vision.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/invokeai/backend/model_manager/load/model_loaders/clip_vision.py b/invokeai/backend/model_manager/load/model_loaders/clip_vision.py index 432e0f11756..cef1c962f9a 100644 --- a/invokeai/backend/model_manager/load/model_loaders/clip_vision.py +++ b/invokeai/backend/model_manager/load/model_loaders/clip_vision.py @@ -29,7 +29,7 @@ def _load_model( raise ValueError("Only DiffusersConfigBase models are currently supported here.") if submodel_type is not None: - raise Exception(f"There are no submodels in models of type {model_class}") + raise Exception("There are no submodels in CLIP Vision models.") model_path = Path(config.path) From 740f6eb19f8fc9def544cb96e7bc5535fb1b22ad Mon Sep 17 00:00:00 2001 From: Ryan Dick Date: Tue, 22 Oct 2024 15:56:49 +0000 Subject: [PATCH 24/30] Skip tests that use the meta device - they fail on the MacOS CI runners. 
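
The skipped tests build dummy state dicts on PyTorch's meta device so that parameter shapes can be inferred without allocating real weights; on the macOS CI runners these meta-device operations currently fail. A minimal, self-contained sketch of the pattern (illustrative only; the test name and tensor shape are placeholders, not values from the diff below):

    # Illustrative sketch, not part of the patch: the skip + meta-device pattern used below.
    import sys

    import pytest
    import torch

    @pytest.mark.skipif(sys.platform == "darwin", reason="Meta-device ops fail on macOS CI runners")
    def test_meta_device_shapes():
        # Tensors created under torch.device("meta") carry shape/dtype only, no storage.
        with torch.device("meta"):
            dummy = torch.empty(16, 768)  # no real memory is allocated
        assert dummy.shape == (16, 768)
        assert dummy.device.type == "meta"
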
--- tests/backend/flux/ip_adapter/test_xlabs_ip_adapter_flux.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/backend/flux/ip_adapter/test_xlabs_ip_adapter_flux.py b/tests/backend/flux/ip_adapter/test_xlabs_ip_adapter_flux.py index 1c88304ea13..93012684b7e 100644 --- a/tests/backend/flux/ip_adapter/test_xlabs_ip_adapter_flux.py +++ b/tests/backend/flux/ip_adapter/test_xlabs_ip_adapter_flux.py @@ -1,4 +1,7 @@ +import sys + import accelerate +import pytest import torch from invokeai.backend.flux.ip_adapter.state_dict_utils import ( @@ -18,6 +21,7 @@ def test_is_state_dict_xlabs_ip_adapter(): assert is_state_dict_xlabs_ip_adapter(sd) +@pytest.mark.skipif(sys.platform == "darwin", reason="Skipping on macOS") def test_infer_xlabs_ip_adapter_params_from_state_dict(): # Construct a dummy state_dict with tensors of the correct shape on the meta device. with torch.device("meta"): @@ -31,6 +35,7 @@ def test_infer_xlabs_ip_adapter_params_from_state_dict(): assert params.clip_embeddings_dim == 768 +@pytest.mark.skipif(sys.platform == "darwin", reason="Skipping on macOS") def test_initialize_xlabs_ip_adapter_flux_from_state_dict(): # Construct a dummy state_dict with tensors of the correct shape on the meta device. with torch.device("meta"): From e48cab02768b039db121cfb27c924544bd7e6532 Mon Sep 17 00:00:00 2001 From: Ryan Dick Date: Tue, 22 Oct 2024 16:32:01 +0000 Subject: [PATCH 25/30] Only allow a single image prompt for FLUX IP-Adapters (haven't really looked into this much, but punting on it for now). --- invokeai/app/invocations/flux_denoise.py | 5 +++++ invokeai/app/invocations/flux_ip_adapter.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/invokeai/app/invocations/flux_denoise.py b/invokeai/app/invocations/flux_denoise.py index e5413d05520..27f8ee02858 100644 --- a/invokeai/app/invocations/flux_denoise.py +++ b/invokeai/app/invocations/flux_denoise.py @@ -570,6 +570,11 @@ def _prep_ip_adapter_image_prompt_clip_embeds( else: raise ValueError(f"Unsupported IP-Adapter image type: {type(ip_adapter_field.image)}") + if len(ipa_image_fields) != 1: + raise ValueError( + f"FLUX IP-Adapter only supports a single image prompt (received {len(ipa_image_fields)})." + ) + ipa_images = [context.images.get_pil(image.image_name) for image in ipa_image_fields] pos_images: list[npt.NDArray[np.uint8]] = [] diff --git a/invokeai/app/invocations/flux_ip_adapter.py b/invokeai/app/invocations/flux_ip_adapter.py index 1b342b3c7a3..9653f859ad0 100644 --- a/invokeai/app/invocations/flux_ip_adapter.py +++ b/invokeai/app/invocations/flux_ip_adapter.py @@ -35,7 +35,7 @@ class FluxIPAdapterInvocation(BaseInvocation): # FLUXIPAdapterInvocation is based closely on IPAdapterInvocation, but with some unsupported features removed. 
- image: Union[ImageField, List[ImageField]] = InputField(description="The IP-Adapter image prompt(s).") + image: ImageField = InputField(description="The IP-Adapter image prompt(s).") ip_adapter_model: ModelIdentifierField = InputField( description="The IP-Adapter model.", title="IP-Adapter Model", ui_type=UIType.IPAdapterModel ) From 0a96466b60bbf610a79287c2ff7199273b28a032 Mon Sep 17 00:00:00 2001 From: Mary Hipp Date: Tue, 22 Oct 2024 15:22:56 -0400 Subject: [PATCH 26/30] feat(ui): add IP adapters to FLUX in linear UI --- invokeai/app/invocations/ip_adapter.py | 2 +- .../components/CanvasAddEntityButtons.tsx | 1 - .../EntityListGlobalActionBarAddLayerMenu.tsx | 2 +- .../components/IPAdapter/IPAdapterModel.tsx | 28 ++++- .../IPAdapter/IPAdapterSettings.tsx | 5 +- .../src/features/controlLayers/store/types.ts | 2 +- .../util/graph/generation/buildFLUXGraph.ts | 35 ++++++ .../frontend/web/src/services/api/schema.ts | 112 ++++++++++++++++-- 8 files changed, 169 insertions(+), 18 deletions(-) diff --git a/invokeai/app/invocations/ip_adapter.py b/invokeai/app/invocations/ip_adapter.py index 361e8f0a71b..e3d92374c75 100644 --- a/invokeai/app/invocations/ip_adapter.py +++ b/invokeai/app/invocations/ip_adapter.py @@ -81,7 +81,7 @@ class IPAdapterInvocation(BaseInvocation): ui_order=-1, ui_type=UIType.IPAdapterModel, ) - clip_vision_model: Literal["ViT-H", "ViT-G"] = InputField( + clip_vision_model: Literal["ViT-H", "ViT-G", "ViT-L"] = InputField( description="CLIP Vision model to use. Overrides model settings. Mandatory for checkpoint models.", default="ViT-H", ui_order=2, diff --git a/invokeai/frontend/web/src/features/controlLayers/components/CanvasAddEntityButtons.tsx b/invokeai/frontend/web/src/features/controlLayers/components/CanvasAddEntityButtons.tsx index 76c7d88fdbd..4fc2fb8b347 100644 --- a/invokeai/frontend/web/src/features/controlLayers/components/CanvasAddEntityButtons.tsx +++ b/invokeai/frontend/web/src/features/controlLayers/components/CanvasAddEntityButtons.tsx @@ -34,7 +34,6 @@ export const CanvasAddEntityButtons = memo(() => { justifyContent="flex-start" leftIcon={} onClick={addGlobalReferenceImage} - isDisabled={isFLUX} > {t('controlLayers.globalReferenceImage')} diff --git a/invokeai/frontend/web/src/features/controlLayers/components/CanvasEntityList/EntityListGlobalActionBarAddLayerMenu.tsx b/invokeai/frontend/web/src/features/controlLayers/components/CanvasEntityList/EntityListGlobalActionBarAddLayerMenu.tsx index 7a9cf30f678..ba5c8e6d037 100644 --- a/invokeai/frontend/web/src/features/controlLayers/components/CanvasEntityList/EntityListGlobalActionBarAddLayerMenu.tsx +++ b/invokeai/frontend/web/src/features/controlLayers/components/CanvasEntityList/EntityListGlobalActionBarAddLayerMenu.tsx @@ -40,7 +40,7 @@ export const EntityListGlobalActionBarAddLayerMenu = memo(() => { /> - } onClick={addGlobalReferenceImage} isDisabled={isFLUX}> + } onClick={addGlobalReferenceImage}> {t('controlLayers.globalReferenceImage')} diff --git a/invokeai/frontend/web/src/features/controlLayers/components/IPAdapter/IPAdapterModel.tsx b/invokeai/frontend/web/src/features/controlLayers/components/IPAdapter/IPAdapterModel.tsx index 5b5add0b854..218582535f3 100644 --- a/invokeai/frontend/web/src/features/controlLayers/components/IPAdapter/IPAdapterModel.tsx +++ b/invokeai/frontend/web/src/features/controlLayers/components/IPAdapter/IPAdapterModel.tsx @@ -2,7 +2,7 @@ import type { ComboboxOnChange } from '@invoke-ai/ui-library'; import { Combobox, Flex, FormControl, Tooltip } from 
'@invoke-ai/ui-library'; import { useAppSelector } from 'app/store/storeHooks'; import { useGroupedModelCombobox } from 'common/hooks/useGroupedModelCombobox'; -import { selectBase } from 'features/controlLayers/store/paramsSlice'; +import { selectBase, selectIsFLUX } from 'features/controlLayers/store/paramsSlice'; import type { CLIPVisionModelV2 } from 'features/controlLayers/store/types'; import { isCLIPVisionModelV2 } from 'features/controlLayers/store/types'; import { memo, useCallback, useMemo } from 'react'; @@ -11,9 +11,13 @@ import { useIPAdapterModels } from 'services/api/hooks/modelsByType'; import type { AnyModelConfig, IPAdapterModelConfig } from 'services/api/types'; import { assert } from 'tsafe'; +// at this time, ViT-L is the only supported clip model for FLUX IP adapter +const FLUX_CLIP_VISION = 'ViT-L'; + const CLIP_VISION_OPTIONS = [ { label: 'ViT-H', value: 'ViT-H' }, { label: 'ViT-G', value: 'ViT-G' }, + { label: FLUX_CLIP_VISION, value: FLUX_CLIP_VISION }, ]; type Props = { @@ -47,6 +51,8 @@ export const IPAdapterModel = memo(({ modelKey, onChangeModel, clipVisionModel, [onChangeCLIPVisionModel] ); + const isFLUX = useAppSelector(selectIsFLUX); + const getIsDisabled = useCallback( (model: AnyModelConfig): boolean => { const isCompatible = currentBaseModel === model.base; @@ -64,10 +70,20 @@ export const IPAdapterModel = memo(({ modelKey, onChangeModel, clipVisionModel, isLoading, }); - const clipVisionModelValue = useMemo( - () => CLIP_VISION_OPTIONS.find((o) => o.value === clipVisionModel), - [clipVisionModel] - ); + const clipVisionOptions = useMemo(() => { + if (isFLUX) { + return CLIP_VISION_OPTIONS.map((option) => ({ ...option, isDisabled: option.value !== FLUX_CLIP_VISION })); + } else { + return CLIP_VISION_OPTIONS; + } + }, [isFLUX]); + + const clipVisionModelValue = useMemo(() => { + if (isFLUX) { + return CLIP_VISION_OPTIONS.find((o) => o.value === FLUX_CLIP_VISION); + } + return CLIP_VISION_OPTIONS.find((o) => o.value === clipVisionModel); + }, [clipVisionModel, isFLUX]); return ( @@ -85,7 +101,7 @@ export const IPAdapterModel = memo(({ modelKey, onChangeModel, clipVisionModel, {selectedModel?.format === 'checkpoint' && ( { const pullBboxIntoIPAdapter = usePullBboxIntoGlobalReferenceImage(entityIdentifier); const isBusy = useCanvasIsBusy(); + const isFLUX = useAppSelector(selectIsFLUX); + return ( @@ -113,7 +116,7 @@ export const IPAdapterSettings = memo(() => { - + {!isFLUX && } diff --git a/invokeai/frontend/web/src/features/controlLayers/store/types.ts b/invokeai/frontend/web/src/features/controlLayers/store/types.ts index aacfd630d53..1905b98cede 100644 --- a/invokeai/frontend/web/src/features/controlLayers/store/types.ts +++ b/invokeai/frontend/web/src/features/controlLayers/store/types.ts @@ -46,7 +46,7 @@ const zControlModeV2 = z.enum(['balanced', 'more_prompt', 'more_control', 'unbal export type ControlModeV2 = z.infer; export const isControlModeV2 = (v: unknown): v is ControlModeV2 => zControlModeV2.safeParse(v).success; -const zCLIPVisionModelV2 = z.enum(['ViT-H', 'ViT-G']); +const zCLIPVisionModelV2 = z.enum(['ViT-H', 'ViT-G', 'ViT-L']); export type CLIPVisionModelV2 = z.infer; export const isCLIPVisionModelV2 = (v: unknown): v is CLIPVisionModelV2 => zCLIPVisionModelV2.safeParse(v).success; diff --git a/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildFLUXGraph.ts b/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildFLUXGraph.ts index cc8a7347fe2..88e99ae8ea1 100644 --- 
a/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildFLUXGraph.ts +++ b/invokeai/frontend/web/src/features/nodes/util/graph/generation/buildFLUXGraph.ts @@ -20,6 +20,7 @@ import { isNonRefinerMainModelConfig } from 'services/api/types'; import { assert } from 'tsafe'; import { addControlNets } from './addControlAdapters'; +import { addIPAdapters } from './addIPAdapters'; const log = logger('system'); @@ -198,6 +199,40 @@ export const buildFLUXGraph = async ( g.deleteNode(controlNetCollector.id); } + const ipAdapterCollector = g.addNode({ + type: 'collect', + id: getPrefixedId('ip_adapter_collector'), + }); + const ipAdapterResult = addIPAdapters(canvas.referenceImages.entities, g, ipAdapterCollector, modelConfig.base); + + const totalIPAdaptersAdded = ipAdapterResult.addedIPAdapters; + if (totalIPAdaptersAdded > 0) { + assert(steps > 2); + const cfg_scale_start_step = 1; + const cfg_scale_end_step = Math.ceil(steps / 2); + assert(cfg_scale_end_step > cfg_scale_start_step); + + const negCond = g.addNode({ + type: 'flux_text_encoder', + id: getPrefixedId('flux_text_encoder'), + prompt: '', + }); + + g.addEdge(modelLoader, 'clip', negCond, 'clip'); + g.addEdge(modelLoader, 't5_encoder', negCond, 't5_encoder'); + g.addEdge(modelLoader, 'max_seq_len', negCond, 't5_max_seq_len'); + g.addEdge(negCond, 'conditioning', noise, 'negative_text_conditioning'); + + g.updateNode(noise, { + cfg_scale: 3, + cfg_scale_start_step, + cfg_scale_end_step, + }); + g.addEdge(ipAdapterCollector, 'collection', noise, 'ip_adapter'); + } else { + g.deleteNode(ipAdapterCollector.id); + } + if (state.system.shouldUseNSFWChecker) { canvasOutput = addNSFWChecker(g, canvasOutput); } diff --git a/invokeai/frontend/web/src/services/api/schema.ts b/invokeai/frontend/web/src/services/api/schema.ts index e16aedccd23..458472f507f 100644 --- a/invokeai/frontend/web/src/services/api/schema.ts +++ b/invokeai/frontend/web/src/services/api/schema.ts @@ -6487,6 +6487,29 @@ export type components = { * @default null */ positive_text_conditioning?: components["schemas"]["FluxConditioningField"]; + /** + * @description Negative conditioning tensor. Can be None if cfg_scale is 1.0. + * @default null + */ + negative_text_conditioning?: components["schemas"]["FluxConditioningField"] | null; + /** + * CFG Scale + * @description Classifier-Free Guidance scale + * @default 1 + */ + cfg_scale?: number | number[]; + /** + * CFG Scale Start Step + * @description Index of the first step to apply cfg_scale. Negative indices count backwards from the the last step (e.g. a value of -1 refers to the final step). + * @default 0 + */ + cfg_scale_start_step?: number; + /** + * CFG Scale End Step + * @description Index of the last step to apply cfg_scale. Negative indices count backwards from the last step (e.g. a value of -1 refers to the final step). + * @default -1 + */ + cfg_scale_end_step?: number; /** * Width * @description Width of the generated image. @@ -6528,6 +6551,12 @@ export type components = { * @default null */ controlnet_vae?: components["schemas"]["VAEField"] | null; + /** + * IP-Adapter + * @description IP-Adapter to apply + * @default null + */ + ip_adapter?: components["schemas"]["IPAdapterField"] | components["schemas"]["IPAdapterField"][] | null; /** * type * @default flux_denoise @@ -6536,6 +6565,74 @@ export type components = { */ type: "flux_denoise"; }; + /** + * FLUX IP-Adapter + * @description Collects FLUX IP-Adapter info to pass to other nodes. 
+ */ + FluxIPAdapterInvocation: { + /** + * Id + * @description The id of this instance of an invocation. Must be unique among all instances of invocations. + */ + id: string; + /** + * Is Intermediate + * @description Whether or not this is an intermediate invocation. + * @default false + */ + is_intermediate?: boolean; + /** + * Use Cache + * @description Whether or not to use the cache + * @default true + */ + use_cache?: boolean; + /** + * Image + * @description The IP-Adapter image prompt(s). + * @default null + */ + image?: components["schemas"]["ImageField"] | components["schemas"]["ImageField"][]; + /** + * IP-Adapter Model + * @description The IP-Adapter model. + * @default null + */ + ip_adapter_model?: components["schemas"]["ModelIdentifierField"]; + /** + * Clip Vision Model + * @description CLIP Vision model to use. + * @default ViT-L + * @constant + * @enum {string} + */ + clip_vision_model?: "ViT-L"; + /** + * Weight + * @description The weight given to the IP-Adapter + * @default 1 + */ + weight?: number | number[]; + /** + * Begin Step Percent + * @description When the IP-Adapter is first applied (% of total steps) + * @default 0 + */ + begin_step_percent?: number; + /** + * End Step Percent + * @description When the IP-Adapter is last applied (% of total steps) + * @default 1 + */ + end_step_percent?: number; + /** + * type + * @default flux_ip_adapter + * @constant + * @enum {string} + */ + type: "flux_ip_adapter"; + }; /** * FLUX LoRA * @description Apply a LoRA model to a FLUX transformer and/or text encoder. @@ -6981,7 +7078,7 @@ export type components = { * @description The nodes in this graph */ nodes?: { - [key: string]: components["schemas"]["AddInvocation"] | components["schemas"]["AlphaMaskToTensorInvocation"] | components["schemas"]["BlankImageInvocation"] | components["schemas"]["BlendLatentsInvocation"] | components["schemas"]["BooleanCollectionInvocation"] | components["schemas"]["BooleanInvocation"] | components["schemas"]["BoundingBoxInvocation"] | components["schemas"]["CLIPSkipInvocation"] | components["schemas"]["CV2InfillInvocation"] | components["schemas"]["CalculateImageTilesEvenSplitInvocation"] | components["schemas"]["CalculateImageTilesInvocation"] | components["schemas"]["CalculateImageTilesMinimumOverlapInvocation"] | components["schemas"]["CannyEdgeDetectionInvocation"] | components["schemas"]["CannyImageProcessorInvocation"] | components["schemas"]["CanvasPasteBackInvocation"] | components["schemas"]["CanvasV2MaskAndCropInvocation"] | components["schemas"]["CenterPadCropInvocation"] | components["schemas"]["CollectInvocation"] | components["schemas"]["ColorCorrectInvocation"] | components["schemas"]["ColorInvocation"] | components["schemas"]["ColorMapImageProcessorInvocation"] | components["schemas"]["ColorMapInvocation"] | components["schemas"]["CompelInvocation"] | components["schemas"]["ConditioningCollectionInvocation"] | components["schemas"]["ConditioningInvocation"] | components["schemas"]["ContentShuffleImageProcessorInvocation"] | components["schemas"]["ContentShuffleInvocation"] | components["schemas"]["ControlNetInvocation"] | components["schemas"]["CoreMetadataInvocation"] | components["schemas"]["CreateDenoiseMaskInvocation"] | components["schemas"]["CreateGradientMaskInvocation"] | components["schemas"]["CropLatentsCoreInvocation"] | components["schemas"]["CvInpaintInvocation"] | components["schemas"]["DWOpenposeDetectionInvocation"] | components["schemas"]["DWOpenposeImageProcessorInvocation"] | 
components["schemas"]["DenoiseLatentsInvocation"] | components["schemas"]["DepthAnythingDepthEstimationInvocation"] | components["schemas"]["DepthAnythingImageProcessorInvocation"] | components["schemas"]["DivideInvocation"] | components["schemas"]["DynamicPromptInvocation"] | components["schemas"]["ESRGANInvocation"] | components["schemas"]["FLUXLoRACollectionLoader"] | components["schemas"]["FaceIdentifierInvocation"] | components["schemas"]["FaceMaskInvocation"] | components["schemas"]["FaceOffInvocation"] | components["schemas"]["FloatCollectionInvocation"] | components["schemas"]["FloatInvocation"] | components["schemas"]["FloatLinearRangeInvocation"] | components["schemas"]["FloatMathInvocation"] | components["schemas"]["FloatToIntegerInvocation"] | components["schemas"]["FluxControlNetInvocation"] | components["schemas"]["FluxDenoiseInvocation"] | components["schemas"]["FluxLoRALoaderInvocation"] | components["schemas"]["FluxModelLoaderInvocation"] | components["schemas"]["FluxTextEncoderInvocation"] | components["schemas"]["FluxVaeDecodeInvocation"] | components["schemas"]["FluxVaeEncodeInvocation"] | components["schemas"]["FreeUInvocation"] | components["schemas"]["GroundingDinoInvocation"] | components["schemas"]["HEDEdgeDetectionInvocation"] | components["schemas"]["HedImageProcessorInvocation"] | components["schemas"]["HeuristicResizeInvocation"] | components["schemas"]["IPAdapterInvocation"] | components["schemas"]["IdealSizeInvocation"] | components["schemas"]["ImageBlurInvocation"] | components["schemas"]["ImageChannelInvocation"] | components["schemas"]["ImageChannelMultiplyInvocation"] | components["schemas"]["ImageChannelOffsetInvocation"] | components["schemas"]["ImageCollectionInvocation"] | components["schemas"]["ImageConvertInvocation"] | components["schemas"]["ImageCropInvocation"] | components["schemas"]["ImageHueAdjustmentInvocation"] | components["schemas"]["ImageInverseLerpInvocation"] | components["schemas"]["ImageInvocation"] | components["schemas"]["ImageLerpInvocation"] | components["schemas"]["ImageMaskToTensorInvocation"] | components["schemas"]["ImageMultiplyInvocation"] | components["schemas"]["ImageNSFWBlurInvocation"] | components["schemas"]["ImagePasteInvocation"] | components["schemas"]["ImageResizeInvocation"] | components["schemas"]["ImageScaleInvocation"] | components["schemas"]["ImageToLatentsInvocation"] | components["schemas"]["ImageWatermarkInvocation"] | components["schemas"]["InfillColorInvocation"] | components["schemas"]["InfillPatchMatchInvocation"] | components["schemas"]["InfillTileInvocation"] | components["schemas"]["IntegerCollectionInvocation"] | components["schemas"]["IntegerInvocation"] | components["schemas"]["IntegerMathInvocation"] | components["schemas"]["InvertTensorMaskInvocation"] | components["schemas"]["IterateInvocation"] | components["schemas"]["LaMaInfillInvocation"] | components["schemas"]["LatentsCollectionInvocation"] | components["schemas"]["LatentsInvocation"] | components["schemas"]["LatentsToImageInvocation"] | components["schemas"]["LeresImageProcessorInvocation"] | components["schemas"]["LineartAnimeEdgeDetectionInvocation"] | components["schemas"]["LineartAnimeImageProcessorInvocation"] | components["schemas"]["LineartEdgeDetectionInvocation"] | components["schemas"]["LineartImageProcessorInvocation"] | components["schemas"]["LoRACollectionLoader"] | components["schemas"]["LoRALoaderInvocation"] | components["schemas"]["LoRASelectorInvocation"] | components["schemas"]["MLSDDetectionInvocation"] | 
components["schemas"]["MainModelLoaderInvocation"] | components["schemas"]["MaskCombineInvocation"] | components["schemas"]["MaskEdgeInvocation"] | components["schemas"]["MaskFromAlphaInvocation"] | components["schemas"]["MaskFromIDInvocation"] | components["schemas"]["MaskTensorToImageInvocation"] | components["schemas"]["MediaPipeFaceDetectionInvocation"] | components["schemas"]["MediapipeFaceProcessorInvocation"] | components["schemas"]["MergeMetadataInvocation"] | components["schemas"]["MergeTilesToImageInvocation"] | components["schemas"]["MetadataInvocation"] | components["schemas"]["MetadataItemInvocation"] | components["schemas"]["MidasDepthImageProcessorInvocation"] | components["schemas"]["MlsdImageProcessorInvocation"] | components["schemas"]["ModelIdentifierInvocation"] | components["schemas"]["MultiplyInvocation"] | components["schemas"]["NoiseInvocation"] | components["schemas"]["NormalMapInvocation"] | components["schemas"]["NormalbaeImageProcessorInvocation"] | components["schemas"]["PairTileImageInvocation"] | components["schemas"]["PiDiNetEdgeDetectionInvocation"] | components["schemas"]["PidiImageProcessorInvocation"] | components["schemas"]["PromptsFromFileInvocation"] | components["schemas"]["RandomFloatInvocation"] | components["schemas"]["RandomIntInvocation"] | components["schemas"]["RandomRangeInvocation"] | components["schemas"]["RangeInvocation"] | components["schemas"]["RangeOfSizeInvocation"] | components["schemas"]["RectangleMaskInvocation"] | components["schemas"]["ResizeLatentsInvocation"] | components["schemas"]["RoundInvocation"] | components["schemas"]["SDXLCompelPromptInvocation"] | components["schemas"]["SDXLLoRACollectionLoader"] | components["schemas"]["SDXLLoRALoaderInvocation"] | components["schemas"]["SDXLModelLoaderInvocation"] | components["schemas"]["SDXLRefinerCompelPromptInvocation"] | components["schemas"]["SDXLRefinerModelLoaderInvocation"] | components["schemas"]["SaveImageInvocation"] | components["schemas"]["ScaleLatentsInvocation"] | components["schemas"]["SchedulerInvocation"] | components["schemas"]["SeamlessModeInvocation"] | components["schemas"]["SegmentAnythingInvocation"] | components["schemas"]["SegmentAnythingProcessorInvocation"] | components["schemas"]["ShowImageInvocation"] | components["schemas"]["SpandrelImageToImageAutoscaleInvocation"] | components["schemas"]["SpandrelImageToImageInvocation"] | components["schemas"]["StepParamEasingInvocation"] | components["schemas"]["StringCollectionInvocation"] | components["schemas"]["StringInvocation"] | components["schemas"]["StringJoinInvocation"] | components["schemas"]["StringJoinThreeInvocation"] | components["schemas"]["StringReplaceInvocation"] | components["schemas"]["StringSplitInvocation"] | components["schemas"]["StringSplitNegInvocation"] | components["schemas"]["SubtractInvocation"] | components["schemas"]["T2IAdapterInvocation"] | components["schemas"]["TileResamplerProcessorInvocation"] | components["schemas"]["TileToPropertiesInvocation"] | components["schemas"]["TiledMultiDiffusionDenoiseLatents"] | components["schemas"]["UnsharpMaskInvocation"] | components["schemas"]["VAELoaderInvocation"] | components["schemas"]["ZoeDepthImageProcessorInvocation"]; + [key: string]: components["schemas"]["AddInvocation"] | components["schemas"]["AlphaMaskToTensorInvocation"] | components["schemas"]["BlankImageInvocation"] | components["schemas"]["BlendLatentsInvocation"] | components["schemas"]["BooleanCollectionInvocation"] | components["schemas"]["BooleanInvocation"] | 
components["schemas"]["BoundingBoxInvocation"] | components["schemas"]["CLIPSkipInvocation"] | components["schemas"]["CV2InfillInvocation"] | components["schemas"]["CalculateImageTilesEvenSplitInvocation"] | components["schemas"]["CalculateImageTilesInvocation"] | components["schemas"]["CalculateImageTilesMinimumOverlapInvocation"] | components["schemas"]["CannyEdgeDetectionInvocation"] | components["schemas"]["CannyImageProcessorInvocation"] | components["schemas"]["CanvasPasteBackInvocation"] | components["schemas"]["CanvasV2MaskAndCropInvocation"] | components["schemas"]["CenterPadCropInvocation"] | components["schemas"]["CollectInvocation"] | components["schemas"]["ColorCorrectInvocation"] | components["schemas"]["ColorInvocation"] | components["schemas"]["ColorMapImageProcessorInvocation"] | components["schemas"]["ColorMapInvocation"] | components["schemas"]["CompelInvocation"] | components["schemas"]["ConditioningCollectionInvocation"] | components["schemas"]["ConditioningInvocation"] | components["schemas"]["ContentShuffleImageProcessorInvocation"] | components["schemas"]["ContentShuffleInvocation"] | components["schemas"]["ControlNetInvocation"] | components["schemas"]["CoreMetadataInvocation"] | components["schemas"]["CreateDenoiseMaskInvocation"] | components["schemas"]["CreateGradientMaskInvocation"] | components["schemas"]["CropLatentsCoreInvocation"] | components["schemas"]["CvInpaintInvocation"] | components["schemas"]["DWOpenposeDetectionInvocation"] | components["schemas"]["DWOpenposeImageProcessorInvocation"] | components["schemas"]["DenoiseLatentsInvocation"] | components["schemas"]["DepthAnythingDepthEstimationInvocation"] | components["schemas"]["DepthAnythingImageProcessorInvocation"] | components["schemas"]["DivideInvocation"] | components["schemas"]["DynamicPromptInvocation"] | components["schemas"]["ESRGANInvocation"] | components["schemas"]["FLUXLoRACollectionLoader"] | components["schemas"]["FaceIdentifierInvocation"] | components["schemas"]["FaceMaskInvocation"] | components["schemas"]["FaceOffInvocation"] | components["schemas"]["FloatCollectionInvocation"] | components["schemas"]["FloatInvocation"] | components["schemas"]["FloatLinearRangeInvocation"] | components["schemas"]["FloatMathInvocation"] | components["schemas"]["FloatToIntegerInvocation"] | components["schemas"]["FluxControlNetInvocation"] | components["schemas"]["FluxDenoiseInvocation"] | components["schemas"]["FluxIPAdapterInvocation"] | components["schemas"]["FluxLoRALoaderInvocation"] | components["schemas"]["FluxModelLoaderInvocation"] | components["schemas"]["FluxTextEncoderInvocation"] | components["schemas"]["FluxVaeDecodeInvocation"] | components["schemas"]["FluxVaeEncodeInvocation"] | components["schemas"]["FreeUInvocation"] | components["schemas"]["GroundingDinoInvocation"] | components["schemas"]["HEDEdgeDetectionInvocation"] | components["schemas"]["HedImageProcessorInvocation"] | components["schemas"]["HeuristicResizeInvocation"] | components["schemas"]["IPAdapterInvocation"] | components["schemas"]["IdealSizeInvocation"] | components["schemas"]["ImageBlurInvocation"] | components["schemas"]["ImageChannelInvocation"] | components["schemas"]["ImageChannelMultiplyInvocation"] | components["schemas"]["ImageChannelOffsetInvocation"] | components["schemas"]["ImageCollectionInvocation"] | components["schemas"]["ImageConvertInvocation"] | components["schemas"]["ImageCropInvocation"] | components["schemas"]["ImageHueAdjustmentInvocation"] | components["schemas"]["ImageInverseLerpInvocation"] | 
components["schemas"]["ImageInvocation"] | components["schemas"]["ImageLerpInvocation"] | components["schemas"]["ImageMaskToTensorInvocation"] | components["schemas"]["ImageMultiplyInvocation"] | components["schemas"]["ImageNSFWBlurInvocation"] | components["schemas"]["ImagePasteInvocation"] | components["schemas"]["ImageResizeInvocation"] | components["schemas"]["ImageScaleInvocation"] | components["schemas"]["ImageToLatentsInvocation"] | components["schemas"]["ImageWatermarkInvocation"] | components["schemas"]["InfillColorInvocation"] | components["schemas"]["InfillPatchMatchInvocation"] | components["schemas"]["InfillTileInvocation"] | components["schemas"]["IntegerCollectionInvocation"] | components["schemas"]["IntegerInvocation"] | components["schemas"]["IntegerMathInvocation"] | components["schemas"]["InvertTensorMaskInvocation"] | components["schemas"]["IterateInvocation"] | components["schemas"]["LaMaInfillInvocation"] | components["schemas"]["LatentsCollectionInvocation"] | components["schemas"]["LatentsInvocation"] | components["schemas"]["LatentsToImageInvocation"] | components["schemas"]["LeresImageProcessorInvocation"] | components["schemas"]["LineartAnimeEdgeDetectionInvocation"] | components["schemas"]["LineartAnimeImageProcessorInvocation"] | components["schemas"]["LineartEdgeDetectionInvocation"] | components["schemas"]["LineartImageProcessorInvocation"] | components["schemas"]["LoRACollectionLoader"] | components["schemas"]["LoRALoaderInvocation"] | components["schemas"]["LoRASelectorInvocation"] | components["schemas"]["MLSDDetectionInvocation"] | components["schemas"]["MainModelLoaderInvocation"] | components["schemas"]["MaskCombineInvocation"] | components["schemas"]["MaskEdgeInvocation"] | components["schemas"]["MaskFromAlphaInvocation"] | components["schemas"]["MaskFromIDInvocation"] | components["schemas"]["MaskTensorToImageInvocation"] | components["schemas"]["MediaPipeFaceDetectionInvocation"] | components["schemas"]["MediapipeFaceProcessorInvocation"] | components["schemas"]["MergeMetadataInvocation"] | components["schemas"]["MergeTilesToImageInvocation"] | components["schemas"]["MetadataInvocation"] | components["schemas"]["MetadataItemInvocation"] | components["schemas"]["MidasDepthImageProcessorInvocation"] | components["schemas"]["MlsdImageProcessorInvocation"] | components["schemas"]["ModelIdentifierInvocation"] | components["schemas"]["MultiplyInvocation"] | components["schemas"]["NoiseInvocation"] | components["schemas"]["NormalMapInvocation"] | components["schemas"]["NormalbaeImageProcessorInvocation"] | components["schemas"]["PairTileImageInvocation"] | components["schemas"]["PiDiNetEdgeDetectionInvocation"] | components["schemas"]["PidiImageProcessorInvocation"] | components["schemas"]["PromptsFromFileInvocation"] | components["schemas"]["RandomFloatInvocation"] | components["schemas"]["RandomIntInvocation"] | components["schemas"]["RandomRangeInvocation"] | components["schemas"]["RangeInvocation"] | components["schemas"]["RangeOfSizeInvocation"] | components["schemas"]["RectangleMaskInvocation"] | components["schemas"]["ResizeLatentsInvocation"] | components["schemas"]["RoundInvocation"] | components["schemas"]["SDXLCompelPromptInvocation"] | components["schemas"]["SDXLLoRACollectionLoader"] | components["schemas"]["SDXLLoRALoaderInvocation"] | components["schemas"]["SDXLModelLoaderInvocation"] | components["schemas"]["SDXLRefinerCompelPromptInvocation"] | components["schemas"]["SDXLRefinerModelLoaderInvocation"] | 
components["schemas"]["SaveImageInvocation"] | components["schemas"]["ScaleLatentsInvocation"] | components["schemas"]["SchedulerInvocation"] | components["schemas"]["SeamlessModeInvocation"] | components["schemas"]["SegmentAnythingInvocation"] | components["schemas"]["SegmentAnythingProcessorInvocation"] | components["schemas"]["ShowImageInvocation"] | components["schemas"]["SpandrelImageToImageAutoscaleInvocation"] | components["schemas"]["SpandrelImageToImageInvocation"] | components["schemas"]["StepParamEasingInvocation"] | components["schemas"]["StringCollectionInvocation"] | components["schemas"]["StringInvocation"] | components["schemas"]["StringJoinInvocation"] | components["schemas"]["StringJoinThreeInvocation"] | components["schemas"]["StringReplaceInvocation"] | components["schemas"]["StringSplitInvocation"] | components["schemas"]["StringSplitNegInvocation"] | components["schemas"]["SubtractInvocation"] | components["schemas"]["T2IAdapterInvocation"] | components["schemas"]["TileResamplerProcessorInvocation"] | components["schemas"]["TileToPropertiesInvocation"] | components["schemas"]["TiledMultiDiffusionDenoiseLatents"] | components["schemas"]["UnsharpMaskInvocation"] | components["schemas"]["VAELoaderInvocation"] | components["schemas"]["ZoeDepthImageProcessorInvocation"]; }; /** * Edges @@ -7482,7 +7579,7 @@ export type components = { * @default ViT-H * @enum {string} */ - clip_vision_model?: "ViT-H" | "ViT-G"; + clip_vision_model?: "ViT-H" | "ViT-G" | "ViT-L"; /** * Weight * @description The weight given to the IP-Adapter @@ -7600,7 +7697,7 @@ export type components = { * @description The CLIP Vision model * @enum {string} */ - clip_vision_model: "ViT-H" | "ViT-G"; + clip_vision_model: "ViT-L" | "ViT-H" | "ViT-G"; /** * Method * @description Method to apply IP Weights with @@ -9466,7 +9563,7 @@ export type components = { * Invocation * @description The ID of the invocation */ - invocation: components["schemas"]["AddInvocation"] | components["schemas"]["AlphaMaskToTensorInvocation"] | components["schemas"]["BlankImageInvocation"] | components["schemas"]["BlendLatentsInvocation"] | components["schemas"]["BooleanCollectionInvocation"] | components["schemas"]["BooleanInvocation"] | components["schemas"]["BoundingBoxInvocation"] | components["schemas"]["CLIPSkipInvocation"] | components["schemas"]["CV2InfillInvocation"] | components["schemas"]["CalculateImageTilesEvenSplitInvocation"] | components["schemas"]["CalculateImageTilesInvocation"] | components["schemas"]["CalculateImageTilesMinimumOverlapInvocation"] | components["schemas"]["CannyEdgeDetectionInvocation"] | components["schemas"]["CannyImageProcessorInvocation"] | components["schemas"]["CanvasPasteBackInvocation"] | components["schemas"]["CanvasV2MaskAndCropInvocation"] | components["schemas"]["CenterPadCropInvocation"] | components["schemas"]["CollectInvocation"] | components["schemas"]["ColorCorrectInvocation"] | components["schemas"]["ColorInvocation"] | components["schemas"]["ColorMapImageProcessorInvocation"] | components["schemas"]["ColorMapInvocation"] | components["schemas"]["CompelInvocation"] | components["schemas"]["ConditioningCollectionInvocation"] | components["schemas"]["ConditioningInvocation"] | components["schemas"]["ContentShuffleImageProcessorInvocation"] | components["schemas"]["ContentShuffleInvocation"] | components["schemas"]["ControlNetInvocation"] | components["schemas"]["CoreMetadataInvocation"] | components["schemas"]["CreateDenoiseMaskInvocation"] | 
components["schemas"]["CreateGradientMaskInvocation"] | components["schemas"]["CropLatentsCoreInvocation"] | components["schemas"]["CvInpaintInvocation"] | components["schemas"]["DWOpenposeDetectionInvocation"] | components["schemas"]["DWOpenposeImageProcessorInvocation"] | components["schemas"]["DenoiseLatentsInvocation"] | components["schemas"]["DepthAnythingDepthEstimationInvocation"] | components["schemas"]["DepthAnythingImageProcessorInvocation"] | components["schemas"]["DivideInvocation"] | components["schemas"]["DynamicPromptInvocation"] | components["schemas"]["ESRGANInvocation"] | components["schemas"]["FLUXLoRACollectionLoader"] | components["schemas"]["FaceIdentifierInvocation"] | components["schemas"]["FaceMaskInvocation"] | components["schemas"]["FaceOffInvocation"] | components["schemas"]["FloatCollectionInvocation"] | components["schemas"]["FloatInvocation"] | components["schemas"]["FloatLinearRangeInvocation"] | components["schemas"]["FloatMathInvocation"] | components["schemas"]["FloatToIntegerInvocation"] | components["schemas"]["FluxControlNetInvocation"] | components["schemas"]["FluxDenoiseInvocation"] | components["schemas"]["FluxLoRALoaderInvocation"] | components["schemas"]["FluxModelLoaderInvocation"] | components["schemas"]["FluxTextEncoderInvocation"] | components["schemas"]["FluxVaeDecodeInvocation"] | components["schemas"]["FluxVaeEncodeInvocation"] | components["schemas"]["FreeUInvocation"] | components["schemas"]["GroundingDinoInvocation"] | components["schemas"]["HEDEdgeDetectionInvocation"] | components["schemas"]["HedImageProcessorInvocation"] | components["schemas"]["HeuristicResizeInvocation"] | components["schemas"]["IPAdapterInvocation"] | components["schemas"]["IdealSizeInvocation"] | components["schemas"]["ImageBlurInvocation"] | components["schemas"]["ImageChannelInvocation"] | components["schemas"]["ImageChannelMultiplyInvocation"] | components["schemas"]["ImageChannelOffsetInvocation"] | components["schemas"]["ImageCollectionInvocation"] | components["schemas"]["ImageConvertInvocation"] | components["schemas"]["ImageCropInvocation"] | components["schemas"]["ImageHueAdjustmentInvocation"] | components["schemas"]["ImageInverseLerpInvocation"] | components["schemas"]["ImageInvocation"] | components["schemas"]["ImageLerpInvocation"] | components["schemas"]["ImageMaskToTensorInvocation"] | components["schemas"]["ImageMultiplyInvocation"] | components["schemas"]["ImageNSFWBlurInvocation"] | components["schemas"]["ImagePasteInvocation"] | components["schemas"]["ImageResizeInvocation"] | components["schemas"]["ImageScaleInvocation"] | components["schemas"]["ImageToLatentsInvocation"] | components["schemas"]["ImageWatermarkInvocation"] | components["schemas"]["InfillColorInvocation"] | components["schemas"]["InfillPatchMatchInvocation"] | components["schemas"]["InfillTileInvocation"] | components["schemas"]["IntegerCollectionInvocation"] | components["schemas"]["IntegerInvocation"] | components["schemas"]["IntegerMathInvocation"] | components["schemas"]["InvertTensorMaskInvocation"] | components["schemas"]["IterateInvocation"] | components["schemas"]["LaMaInfillInvocation"] | components["schemas"]["LatentsCollectionInvocation"] | components["schemas"]["LatentsInvocation"] | components["schemas"]["LatentsToImageInvocation"] | components["schemas"]["LeresImageProcessorInvocation"] | components["schemas"]["LineartAnimeEdgeDetectionInvocation"] | components["schemas"]["LineartAnimeImageProcessorInvocation"] | 
components["schemas"]["LineartEdgeDetectionInvocation"] | components["schemas"]["LineartImageProcessorInvocation"] | components["schemas"]["LoRACollectionLoader"] | components["schemas"]["LoRALoaderInvocation"] | components["schemas"]["LoRASelectorInvocation"] | components["schemas"]["MLSDDetectionInvocation"] | components["schemas"]["MainModelLoaderInvocation"] | components["schemas"]["MaskCombineInvocation"] | components["schemas"]["MaskEdgeInvocation"] | components["schemas"]["MaskFromAlphaInvocation"] | components["schemas"]["MaskFromIDInvocation"] | components["schemas"]["MaskTensorToImageInvocation"] | components["schemas"]["MediaPipeFaceDetectionInvocation"] | components["schemas"]["MediapipeFaceProcessorInvocation"] | components["schemas"]["MergeMetadataInvocation"] | components["schemas"]["MergeTilesToImageInvocation"] | components["schemas"]["MetadataInvocation"] | components["schemas"]["MetadataItemInvocation"] | components["schemas"]["MidasDepthImageProcessorInvocation"] | components["schemas"]["MlsdImageProcessorInvocation"] | components["schemas"]["ModelIdentifierInvocation"] | components["schemas"]["MultiplyInvocation"] | components["schemas"]["NoiseInvocation"] | components["schemas"]["NormalMapInvocation"] | components["schemas"]["NormalbaeImageProcessorInvocation"] | components["schemas"]["PairTileImageInvocation"] | components["schemas"]["PiDiNetEdgeDetectionInvocation"] | components["schemas"]["PidiImageProcessorInvocation"] | components["schemas"]["PromptsFromFileInvocation"] | components["schemas"]["RandomFloatInvocation"] | components["schemas"]["RandomIntInvocation"] | components["schemas"]["RandomRangeInvocation"] | components["schemas"]["RangeInvocation"] | components["schemas"]["RangeOfSizeInvocation"] | components["schemas"]["RectangleMaskInvocation"] | components["schemas"]["ResizeLatentsInvocation"] | components["schemas"]["RoundInvocation"] | components["schemas"]["SDXLCompelPromptInvocation"] | components["schemas"]["SDXLLoRACollectionLoader"] | components["schemas"]["SDXLLoRALoaderInvocation"] | components["schemas"]["SDXLModelLoaderInvocation"] | components["schemas"]["SDXLRefinerCompelPromptInvocation"] | components["schemas"]["SDXLRefinerModelLoaderInvocation"] | components["schemas"]["SaveImageInvocation"] | components["schemas"]["ScaleLatentsInvocation"] | components["schemas"]["SchedulerInvocation"] | components["schemas"]["SeamlessModeInvocation"] | components["schemas"]["SegmentAnythingInvocation"] | components["schemas"]["SegmentAnythingProcessorInvocation"] | components["schemas"]["ShowImageInvocation"] | components["schemas"]["SpandrelImageToImageAutoscaleInvocation"] | components["schemas"]["SpandrelImageToImageInvocation"] | components["schemas"]["StepParamEasingInvocation"] | components["schemas"]["StringCollectionInvocation"] | components["schemas"]["StringInvocation"] | components["schemas"]["StringJoinInvocation"] | components["schemas"]["StringJoinThreeInvocation"] | components["schemas"]["StringReplaceInvocation"] | components["schemas"]["StringSplitInvocation"] | components["schemas"]["StringSplitNegInvocation"] | components["schemas"]["SubtractInvocation"] | components["schemas"]["T2IAdapterInvocation"] | components["schemas"]["TileResamplerProcessorInvocation"] | components["schemas"]["TileToPropertiesInvocation"] | components["schemas"]["TiledMultiDiffusionDenoiseLatents"] | components["schemas"]["UnsharpMaskInvocation"] | components["schemas"]["VAELoaderInvocation"] | components["schemas"]["ZoeDepthImageProcessorInvocation"]; + 
invocation: components["schemas"]["AddInvocation"] | components["schemas"]["AlphaMaskToTensorInvocation"] | components["schemas"]["BlankImageInvocation"] | components["schemas"]["BlendLatentsInvocation"] | components["schemas"]["BooleanCollectionInvocation"] | components["schemas"]["BooleanInvocation"] | components["schemas"]["BoundingBoxInvocation"] | components["schemas"]["CLIPSkipInvocation"] | components["schemas"]["CV2InfillInvocation"] | components["schemas"]["CalculateImageTilesEvenSplitInvocation"] | components["schemas"]["CalculateImageTilesInvocation"] | components["schemas"]["CalculateImageTilesMinimumOverlapInvocation"] | components["schemas"]["CannyEdgeDetectionInvocation"] | components["schemas"]["CannyImageProcessorInvocation"] | components["schemas"]["CanvasPasteBackInvocation"] | components["schemas"]["CanvasV2MaskAndCropInvocation"] | components["schemas"]["CenterPadCropInvocation"] | components["schemas"]["CollectInvocation"] | components["schemas"]["ColorCorrectInvocation"] | components["schemas"]["ColorInvocation"] | components["schemas"]["ColorMapImageProcessorInvocation"] | components["schemas"]["ColorMapInvocation"] | components["schemas"]["CompelInvocation"] | components["schemas"]["ConditioningCollectionInvocation"] | components["schemas"]["ConditioningInvocation"] | components["schemas"]["ContentShuffleImageProcessorInvocation"] | components["schemas"]["ContentShuffleInvocation"] | components["schemas"]["ControlNetInvocation"] | components["schemas"]["CoreMetadataInvocation"] | components["schemas"]["CreateDenoiseMaskInvocation"] | components["schemas"]["CreateGradientMaskInvocation"] | components["schemas"]["CropLatentsCoreInvocation"] | components["schemas"]["CvInpaintInvocation"] | components["schemas"]["DWOpenposeDetectionInvocation"] | components["schemas"]["DWOpenposeImageProcessorInvocation"] | components["schemas"]["DenoiseLatentsInvocation"] | components["schemas"]["DepthAnythingDepthEstimationInvocation"] | components["schemas"]["DepthAnythingImageProcessorInvocation"] | components["schemas"]["DivideInvocation"] | components["schemas"]["DynamicPromptInvocation"] | components["schemas"]["ESRGANInvocation"] | components["schemas"]["FLUXLoRACollectionLoader"] | components["schemas"]["FaceIdentifierInvocation"] | components["schemas"]["FaceMaskInvocation"] | components["schemas"]["FaceOffInvocation"] | components["schemas"]["FloatCollectionInvocation"] | components["schemas"]["FloatInvocation"] | components["schemas"]["FloatLinearRangeInvocation"] | components["schemas"]["FloatMathInvocation"] | components["schemas"]["FloatToIntegerInvocation"] | components["schemas"]["FluxControlNetInvocation"] | components["schemas"]["FluxDenoiseInvocation"] | components["schemas"]["FluxIPAdapterInvocation"] | components["schemas"]["FluxLoRALoaderInvocation"] | components["schemas"]["FluxModelLoaderInvocation"] | components["schemas"]["FluxTextEncoderInvocation"] | components["schemas"]["FluxVaeDecodeInvocation"] | components["schemas"]["FluxVaeEncodeInvocation"] | components["schemas"]["FreeUInvocation"] | components["schemas"]["GroundingDinoInvocation"] | components["schemas"]["HEDEdgeDetectionInvocation"] | components["schemas"]["HedImageProcessorInvocation"] | components["schemas"]["HeuristicResizeInvocation"] | components["schemas"]["IPAdapterInvocation"] | components["schemas"]["IdealSizeInvocation"] | components["schemas"]["ImageBlurInvocation"] | components["schemas"]["ImageChannelInvocation"] | components["schemas"]["ImageChannelMultiplyInvocation"] | 
components["schemas"]["ImageChannelOffsetInvocation"] | components["schemas"]["ImageCollectionInvocation"] | components["schemas"]["ImageConvertInvocation"] | components["schemas"]["ImageCropInvocation"] | components["schemas"]["ImageHueAdjustmentInvocation"] | components["schemas"]["ImageInverseLerpInvocation"] | components["schemas"]["ImageInvocation"] | components["schemas"]["ImageLerpInvocation"] | components["schemas"]["ImageMaskToTensorInvocation"] | components["schemas"]["ImageMultiplyInvocation"] | components["schemas"]["ImageNSFWBlurInvocation"] | components["schemas"]["ImagePasteInvocation"] | components["schemas"]["ImageResizeInvocation"] | components["schemas"]["ImageScaleInvocation"] | components["schemas"]["ImageToLatentsInvocation"] | components["schemas"]["ImageWatermarkInvocation"] | components["schemas"]["InfillColorInvocation"] | components["schemas"]["InfillPatchMatchInvocation"] | components["schemas"]["InfillTileInvocation"] | components["schemas"]["IntegerCollectionInvocation"] | components["schemas"]["IntegerInvocation"] | components["schemas"]["IntegerMathInvocation"] | components["schemas"]["InvertTensorMaskInvocation"] | components["schemas"]["IterateInvocation"] | components["schemas"]["LaMaInfillInvocation"] | components["schemas"]["LatentsCollectionInvocation"] | components["schemas"]["LatentsInvocation"] | components["schemas"]["LatentsToImageInvocation"] | components["schemas"]["LeresImageProcessorInvocation"] | components["schemas"]["LineartAnimeEdgeDetectionInvocation"] | components["schemas"]["LineartAnimeImageProcessorInvocation"] | components["schemas"]["LineartEdgeDetectionInvocation"] | components["schemas"]["LineartImageProcessorInvocation"] | components["schemas"]["LoRACollectionLoader"] | components["schemas"]["LoRALoaderInvocation"] | components["schemas"]["LoRASelectorInvocation"] | components["schemas"]["MLSDDetectionInvocation"] | components["schemas"]["MainModelLoaderInvocation"] | components["schemas"]["MaskCombineInvocation"] | components["schemas"]["MaskEdgeInvocation"] | components["schemas"]["MaskFromAlphaInvocation"] | components["schemas"]["MaskFromIDInvocation"] | components["schemas"]["MaskTensorToImageInvocation"] | components["schemas"]["MediaPipeFaceDetectionInvocation"] | components["schemas"]["MediapipeFaceProcessorInvocation"] | components["schemas"]["MergeMetadataInvocation"] | components["schemas"]["MergeTilesToImageInvocation"] | components["schemas"]["MetadataInvocation"] | components["schemas"]["MetadataItemInvocation"] | components["schemas"]["MidasDepthImageProcessorInvocation"] | components["schemas"]["MlsdImageProcessorInvocation"] | components["schemas"]["ModelIdentifierInvocation"] | components["schemas"]["MultiplyInvocation"] | components["schemas"]["NoiseInvocation"] | components["schemas"]["NormalMapInvocation"] | components["schemas"]["NormalbaeImageProcessorInvocation"] | components["schemas"]["PairTileImageInvocation"] | components["schemas"]["PiDiNetEdgeDetectionInvocation"] | components["schemas"]["PidiImageProcessorInvocation"] | components["schemas"]["PromptsFromFileInvocation"] | components["schemas"]["RandomFloatInvocation"] | components["schemas"]["RandomIntInvocation"] | components["schemas"]["RandomRangeInvocation"] | components["schemas"]["RangeInvocation"] | components["schemas"]["RangeOfSizeInvocation"] | components["schemas"]["RectangleMaskInvocation"] | components["schemas"]["ResizeLatentsInvocation"] | components["schemas"]["RoundInvocation"] | components["schemas"]["SDXLCompelPromptInvocation"] | 
components["schemas"]["SDXLLoRACollectionLoader"] | components["schemas"]["SDXLLoRALoaderInvocation"] | components["schemas"]["SDXLModelLoaderInvocation"] | components["schemas"]["SDXLRefinerCompelPromptInvocation"] | components["schemas"]["SDXLRefinerModelLoaderInvocation"] | components["schemas"]["SaveImageInvocation"] | components["schemas"]["ScaleLatentsInvocation"] | components["schemas"]["SchedulerInvocation"] | components["schemas"]["SeamlessModeInvocation"] | components["schemas"]["SegmentAnythingInvocation"] | components["schemas"]["SegmentAnythingProcessorInvocation"] | components["schemas"]["ShowImageInvocation"] | components["schemas"]["SpandrelImageToImageAutoscaleInvocation"] | components["schemas"]["SpandrelImageToImageInvocation"] | components["schemas"]["StepParamEasingInvocation"] | components["schemas"]["StringCollectionInvocation"] | components["schemas"]["StringInvocation"] | components["schemas"]["StringJoinInvocation"] | components["schemas"]["StringJoinThreeInvocation"] | components["schemas"]["StringReplaceInvocation"] | components["schemas"]["StringSplitInvocation"] | components["schemas"]["StringSplitNegInvocation"] | components["schemas"]["SubtractInvocation"] | components["schemas"]["T2IAdapterInvocation"] | components["schemas"]["TileResamplerProcessorInvocation"] | components["schemas"]["TileToPropertiesInvocation"] | components["schemas"]["TiledMultiDiffusionDenoiseLatents"] | components["schemas"]["UnsharpMaskInvocation"] | components["schemas"]["VAELoaderInvocation"] | components["schemas"]["ZoeDepthImageProcessorInvocation"]; /** * Invocation Source Id * @description The ID of the prepared invocation's source node @@ -9524,7 +9621,7 @@ export type components = { * Invocation * @description The ID of the invocation */ - invocation: components["schemas"]["AddInvocation"] | components["schemas"]["AlphaMaskToTensorInvocation"] | components["schemas"]["BlankImageInvocation"] | components["schemas"]["BlendLatentsInvocation"] | components["schemas"]["BooleanCollectionInvocation"] | components["schemas"]["BooleanInvocation"] | components["schemas"]["BoundingBoxInvocation"] | components["schemas"]["CLIPSkipInvocation"] | components["schemas"]["CV2InfillInvocation"] | components["schemas"]["CalculateImageTilesEvenSplitInvocation"] | components["schemas"]["CalculateImageTilesInvocation"] | components["schemas"]["CalculateImageTilesMinimumOverlapInvocation"] | components["schemas"]["CannyEdgeDetectionInvocation"] | components["schemas"]["CannyImageProcessorInvocation"] | components["schemas"]["CanvasPasteBackInvocation"] | components["schemas"]["CanvasV2MaskAndCropInvocation"] | components["schemas"]["CenterPadCropInvocation"] | components["schemas"]["CollectInvocation"] | components["schemas"]["ColorCorrectInvocation"] | components["schemas"]["ColorInvocation"] | components["schemas"]["ColorMapImageProcessorInvocation"] | components["schemas"]["ColorMapInvocation"] | components["schemas"]["CompelInvocation"] | components["schemas"]["ConditioningCollectionInvocation"] | components["schemas"]["ConditioningInvocation"] | components["schemas"]["ContentShuffleImageProcessorInvocation"] | components["schemas"]["ContentShuffleInvocation"] | components["schemas"]["ControlNetInvocation"] | components["schemas"]["CoreMetadataInvocation"] | components["schemas"]["CreateDenoiseMaskInvocation"] | components["schemas"]["CreateGradientMaskInvocation"] | components["schemas"]["CropLatentsCoreInvocation"] | components["schemas"]["CvInpaintInvocation"] | 
components["schemas"]["DWOpenposeDetectionInvocation"] | components["schemas"]["DWOpenposeImageProcessorInvocation"] | components["schemas"]["DenoiseLatentsInvocation"] | components["schemas"]["DepthAnythingDepthEstimationInvocation"] | components["schemas"]["DepthAnythingImageProcessorInvocation"] | components["schemas"]["DivideInvocation"] | components["schemas"]["DynamicPromptInvocation"] | components["schemas"]["ESRGANInvocation"] | components["schemas"]["FLUXLoRACollectionLoader"] | components["schemas"]["FaceIdentifierInvocation"] | components["schemas"]["FaceMaskInvocation"] | components["schemas"]["FaceOffInvocation"] | components["schemas"]["FloatCollectionInvocation"] | components["schemas"]["FloatInvocation"] | components["schemas"]["FloatLinearRangeInvocation"] | components["schemas"]["FloatMathInvocation"] | components["schemas"]["FloatToIntegerInvocation"] | components["schemas"]["FluxControlNetInvocation"] | components["schemas"]["FluxDenoiseInvocation"] | components["schemas"]["FluxLoRALoaderInvocation"] | components["schemas"]["FluxModelLoaderInvocation"] | components["schemas"]["FluxTextEncoderInvocation"] | components["schemas"]["FluxVaeDecodeInvocation"] | components["schemas"]["FluxVaeEncodeInvocation"] | components["schemas"]["FreeUInvocation"] | components["schemas"]["GroundingDinoInvocation"] | components["schemas"]["HEDEdgeDetectionInvocation"] | components["schemas"]["HedImageProcessorInvocation"] | components["schemas"]["HeuristicResizeInvocation"] | components["schemas"]["IPAdapterInvocation"] | components["schemas"]["IdealSizeInvocation"] | components["schemas"]["ImageBlurInvocation"] | components["schemas"]["ImageChannelInvocation"] | components["schemas"]["ImageChannelMultiplyInvocation"] | components["schemas"]["ImageChannelOffsetInvocation"] | components["schemas"]["ImageCollectionInvocation"] | components["schemas"]["ImageConvertInvocation"] | components["schemas"]["ImageCropInvocation"] | components["schemas"]["ImageHueAdjustmentInvocation"] | components["schemas"]["ImageInverseLerpInvocation"] | components["schemas"]["ImageInvocation"] | components["schemas"]["ImageLerpInvocation"] | components["schemas"]["ImageMaskToTensorInvocation"] | components["schemas"]["ImageMultiplyInvocation"] | components["schemas"]["ImageNSFWBlurInvocation"] | components["schemas"]["ImagePasteInvocation"] | components["schemas"]["ImageResizeInvocation"] | components["schemas"]["ImageScaleInvocation"] | components["schemas"]["ImageToLatentsInvocation"] | components["schemas"]["ImageWatermarkInvocation"] | components["schemas"]["InfillColorInvocation"] | components["schemas"]["InfillPatchMatchInvocation"] | components["schemas"]["InfillTileInvocation"] | components["schemas"]["IntegerCollectionInvocation"] | components["schemas"]["IntegerInvocation"] | components["schemas"]["IntegerMathInvocation"] | components["schemas"]["InvertTensorMaskInvocation"] | components["schemas"]["IterateInvocation"] | components["schemas"]["LaMaInfillInvocation"] | components["schemas"]["LatentsCollectionInvocation"] | components["schemas"]["LatentsInvocation"] | components["schemas"]["LatentsToImageInvocation"] | components["schemas"]["LeresImageProcessorInvocation"] | components["schemas"]["LineartAnimeEdgeDetectionInvocation"] | components["schemas"]["LineartAnimeImageProcessorInvocation"] | components["schemas"]["LineartEdgeDetectionInvocation"] | components["schemas"]["LineartImageProcessorInvocation"] | components["schemas"]["LoRACollectionLoader"] | 
components["schemas"]["LoRALoaderInvocation"] | components["schemas"]["LoRASelectorInvocation"] | components["schemas"]["MLSDDetectionInvocation"] | components["schemas"]["MainModelLoaderInvocation"] | components["schemas"]["MaskCombineInvocation"] | components["schemas"]["MaskEdgeInvocation"] | components["schemas"]["MaskFromAlphaInvocation"] | components["schemas"]["MaskFromIDInvocation"] | components["schemas"]["MaskTensorToImageInvocation"] | components["schemas"]["MediaPipeFaceDetectionInvocation"] | components["schemas"]["MediapipeFaceProcessorInvocation"] | components["schemas"]["MergeMetadataInvocation"] | components["schemas"]["MergeTilesToImageInvocation"] | components["schemas"]["MetadataInvocation"] | components["schemas"]["MetadataItemInvocation"] | components["schemas"]["MidasDepthImageProcessorInvocation"] | components["schemas"]["MlsdImageProcessorInvocation"] | components["schemas"]["ModelIdentifierInvocation"] | components["schemas"]["MultiplyInvocation"] | components["schemas"]["NoiseInvocation"] | components["schemas"]["NormalMapInvocation"] | components["schemas"]["NormalbaeImageProcessorInvocation"] | components["schemas"]["PairTileImageInvocation"] | components["schemas"]["PiDiNetEdgeDetectionInvocation"] | components["schemas"]["PidiImageProcessorInvocation"] | components["schemas"]["PromptsFromFileInvocation"] | components["schemas"]["RandomFloatInvocation"] | components["schemas"]["RandomIntInvocation"] | components["schemas"]["RandomRangeInvocation"] | components["schemas"]["RangeInvocation"] | components["schemas"]["RangeOfSizeInvocation"] | components["schemas"]["RectangleMaskInvocation"] | components["schemas"]["ResizeLatentsInvocation"] | components["schemas"]["RoundInvocation"] | components["schemas"]["SDXLCompelPromptInvocation"] | components["schemas"]["SDXLLoRACollectionLoader"] | components["schemas"]["SDXLLoRALoaderInvocation"] | components["schemas"]["SDXLModelLoaderInvocation"] | components["schemas"]["SDXLRefinerCompelPromptInvocation"] | components["schemas"]["SDXLRefinerModelLoaderInvocation"] | components["schemas"]["SaveImageInvocation"] | components["schemas"]["ScaleLatentsInvocation"] | components["schemas"]["SchedulerInvocation"] | components["schemas"]["SeamlessModeInvocation"] | components["schemas"]["SegmentAnythingInvocation"] | components["schemas"]["SegmentAnythingProcessorInvocation"] | components["schemas"]["ShowImageInvocation"] | components["schemas"]["SpandrelImageToImageAutoscaleInvocation"] | components["schemas"]["SpandrelImageToImageInvocation"] | components["schemas"]["StepParamEasingInvocation"] | components["schemas"]["StringCollectionInvocation"] | components["schemas"]["StringInvocation"] | components["schemas"]["StringJoinInvocation"] | components["schemas"]["StringJoinThreeInvocation"] | components["schemas"]["StringReplaceInvocation"] | components["schemas"]["StringSplitInvocation"] | components["schemas"]["StringSplitNegInvocation"] | components["schemas"]["SubtractInvocation"] | components["schemas"]["T2IAdapterInvocation"] | components["schemas"]["TileResamplerProcessorInvocation"] | components["schemas"]["TileToPropertiesInvocation"] | components["schemas"]["TiledMultiDiffusionDenoiseLatents"] | components["schemas"]["UnsharpMaskInvocation"] | components["schemas"]["VAELoaderInvocation"] | components["schemas"]["ZoeDepthImageProcessorInvocation"]; + invocation: components["schemas"]["AddInvocation"] | components["schemas"]["AlphaMaskToTensorInvocation"] | components["schemas"]["BlankImageInvocation"] | 
components["schemas"]["BlendLatentsInvocation"] | components["schemas"]["BooleanCollectionInvocation"] | components["schemas"]["BooleanInvocation"] | components["schemas"]["BoundingBoxInvocation"] | components["schemas"]["CLIPSkipInvocation"] | components["schemas"]["CV2InfillInvocation"] | components["schemas"]["CalculateImageTilesEvenSplitInvocation"] | components["schemas"]["CalculateImageTilesInvocation"] | components["schemas"]["CalculateImageTilesMinimumOverlapInvocation"] | components["schemas"]["CannyEdgeDetectionInvocation"] | components["schemas"]["CannyImageProcessorInvocation"] | components["schemas"]["CanvasPasteBackInvocation"] | components["schemas"]["CanvasV2MaskAndCropInvocation"] | components["schemas"]["CenterPadCropInvocation"] | components["schemas"]["CollectInvocation"] | components["schemas"]["ColorCorrectInvocation"] | components["schemas"]["ColorInvocation"] | components["schemas"]["ColorMapImageProcessorInvocation"] | components["schemas"]["ColorMapInvocation"] | components["schemas"]["CompelInvocation"] | components["schemas"]["ConditioningCollectionInvocation"] | components["schemas"]["ConditioningInvocation"] | components["schemas"]["ContentShuffleImageProcessorInvocation"] | components["schemas"]["ContentShuffleInvocation"] | components["schemas"]["ControlNetInvocation"] | components["schemas"]["CoreMetadataInvocation"] | components["schemas"]["CreateDenoiseMaskInvocation"] | components["schemas"]["CreateGradientMaskInvocation"] | components["schemas"]["CropLatentsCoreInvocation"] | components["schemas"]["CvInpaintInvocation"] | components["schemas"]["DWOpenposeDetectionInvocation"] | components["schemas"]["DWOpenposeImageProcessorInvocation"] | components["schemas"]["DenoiseLatentsInvocation"] | components["schemas"]["DepthAnythingDepthEstimationInvocation"] | components["schemas"]["DepthAnythingImageProcessorInvocation"] | components["schemas"]["DivideInvocation"] | components["schemas"]["DynamicPromptInvocation"] | components["schemas"]["ESRGANInvocation"] | components["schemas"]["FLUXLoRACollectionLoader"] | components["schemas"]["FaceIdentifierInvocation"] | components["schemas"]["FaceMaskInvocation"] | components["schemas"]["FaceOffInvocation"] | components["schemas"]["FloatCollectionInvocation"] | components["schemas"]["FloatInvocation"] | components["schemas"]["FloatLinearRangeInvocation"] | components["schemas"]["FloatMathInvocation"] | components["schemas"]["FloatToIntegerInvocation"] | components["schemas"]["FluxControlNetInvocation"] | components["schemas"]["FluxDenoiseInvocation"] | components["schemas"]["FluxIPAdapterInvocation"] | components["schemas"]["FluxLoRALoaderInvocation"] | components["schemas"]["FluxModelLoaderInvocation"] | components["schemas"]["FluxTextEncoderInvocation"] | components["schemas"]["FluxVaeDecodeInvocation"] | components["schemas"]["FluxVaeEncodeInvocation"] | components["schemas"]["FreeUInvocation"] | components["schemas"]["GroundingDinoInvocation"] | components["schemas"]["HEDEdgeDetectionInvocation"] | components["schemas"]["HedImageProcessorInvocation"] | components["schemas"]["HeuristicResizeInvocation"] | components["schemas"]["IPAdapterInvocation"] | components["schemas"]["IdealSizeInvocation"] | components["schemas"]["ImageBlurInvocation"] | components["schemas"]["ImageChannelInvocation"] | components["schemas"]["ImageChannelMultiplyInvocation"] | components["schemas"]["ImageChannelOffsetInvocation"] | components["schemas"]["ImageCollectionInvocation"] | components["schemas"]["ImageConvertInvocation"] | 
components["schemas"]["ImageCropInvocation"] | components["schemas"]["ImageHueAdjustmentInvocation"] | components["schemas"]["ImageInverseLerpInvocation"] | components["schemas"]["ImageInvocation"] | components["schemas"]["ImageLerpInvocation"] | components["schemas"]["ImageMaskToTensorInvocation"] | components["schemas"]["ImageMultiplyInvocation"] | components["schemas"]["ImageNSFWBlurInvocation"] | components["schemas"]["ImagePasteInvocation"] | components["schemas"]["ImageResizeInvocation"] | components["schemas"]["ImageScaleInvocation"] | components["schemas"]["ImageToLatentsInvocation"] | components["schemas"]["ImageWatermarkInvocation"] | components["schemas"]["InfillColorInvocation"] | components["schemas"]["InfillPatchMatchInvocation"] | components["schemas"]["InfillTileInvocation"] | components["schemas"]["IntegerCollectionInvocation"] | components["schemas"]["IntegerInvocation"] | components["schemas"]["IntegerMathInvocation"] | components["schemas"]["InvertTensorMaskInvocation"] | components["schemas"]["IterateInvocation"] | components["schemas"]["LaMaInfillInvocation"] | components["schemas"]["LatentsCollectionInvocation"] | components["schemas"]["LatentsInvocation"] | components["schemas"]["LatentsToImageInvocation"] | components["schemas"]["LeresImageProcessorInvocation"] | components["schemas"]["LineartAnimeEdgeDetectionInvocation"] | components["schemas"]["LineartAnimeImageProcessorInvocation"] | components["schemas"]["LineartEdgeDetectionInvocation"] | components["schemas"]["LineartImageProcessorInvocation"] | components["schemas"]["LoRACollectionLoader"] | components["schemas"]["LoRALoaderInvocation"] | components["schemas"]["LoRASelectorInvocation"] | components["schemas"]["MLSDDetectionInvocation"] | components["schemas"]["MainModelLoaderInvocation"] | components["schemas"]["MaskCombineInvocation"] | components["schemas"]["MaskEdgeInvocation"] | components["schemas"]["MaskFromAlphaInvocation"] | components["schemas"]["MaskFromIDInvocation"] | components["schemas"]["MaskTensorToImageInvocation"] | components["schemas"]["MediaPipeFaceDetectionInvocation"] | components["schemas"]["MediapipeFaceProcessorInvocation"] | components["schemas"]["MergeMetadataInvocation"] | components["schemas"]["MergeTilesToImageInvocation"] | components["schemas"]["MetadataInvocation"] | components["schemas"]["MetadataItemInvocation"] | components["schemas"]["MidasDepthImageProcessorInvocation"] | components["schemas"]["MlsdImageProcessorInvocation"] | components["schemas"]["ModelIdentifierInvocation"] | components["schemas"]["MultiplyInvocation"] | components["schemas"]["NoiseInvocation"] | components["schemas"]["NormalMapInvocation"] | components["schemas"]["NormalbaeImageProcessorInvocation"] | components["schemas"]["PairTileImageInvocation"] | components["schemas"]["PiDiNetEdgeDetectionInvocation"] | components["schemas"]["PidiImageProcessorInvocation"] | components["schemas"]["PromptsFromFileInvocation"] | components["schemas"]["RandomFloatInvocation"] | components["schemas"]["RandomIntInvocation"] | components["schemas"]["RandomRangeInvocation"] | components["schemas"]["RangeInvocation"] | components["schemas"]["RangeOfSizeInvocation"] | components["schemas"]["RectangleMaskInvocation"] | components["schemas"]["ResizeLatentsInvocation"] | components["schemas"]["RoundInvocation"] | components["schemas"]["SDXLCompelPromptInvocation"] | components["schemas"]["SDXLLoRACollectionLoader"] | components["schemas"]["SDXLLoRALoaderInvocation"] | components["schemas"]["SDXLModelLoaderInvocation"] | 
components["schemas"]["SDXLRefinerCompelPromptInvocation"] | components["schemas"]["SDXLRefinerModelLoaderInvocation"] | components["schemas"]["SaveImageInvocation"] | components["schemas"]["ScaleLatentsInvocation"] | components["schemas"]["SchedulerInvocation"] | components["schemas"]["SeamlessModeInvocation"] | components["schemas"]["SegmentAnythingInvocation"] | components["schemas"]["SegmentAnythingProcessorInvocation"] | components["schemas"]["ShowImageInvocation"] | components["schemas"]["SpandrelImageToImageAutoscaleInvocation"] | components["schemas"]["SpandrelImageToImageInvocation"] | components["schemas"]["StepParamEasingInvocation"] | components["schemas"]["StringCollectionInvocation"] | components["schemas"]["StringInvocation"] | components["schemas"]["StringJoinInvocation"] | components["schemas"]["StringJoinThreeInvocation"] | components["schemas"]["StringReplaceInvocation"] | components["schemas"]["StringSplitInvocation"] | components["schemas"]["StringSplitNegInvocation"] | components["schemas"]["SubtractInvocation"] | components["schemas"]["T2IAdapterInvocation"] | components["schemas"]["TileResamplerProcessorInvocation"] | components["schemas"]["TileToPropertiesInvocation"] | components["schemas"]["TiledMultiDiffusionDenoiseLatents"] | components["schemas"]["UnsharpMaskInvocation"] | components["schemas"]["VAELoaderInvocation"] | components["schemas"]["ZoeDepthImageProcessorInvocation"]; /** * Invocation Source Id * @description The ID of the prepared invocation's source node @@ -9607,6 +9704,7 @@ export type components = { float_to_int: components["schemas"]["IntegerOutput"]; flux_controlnet: components["schemas"]["FluxControlNetOutput"]; flux_denoise: components["schemas"]["LatentsOutput"]; + flux_ip_adapter: components["schemas"]["IPAdapterOutput"]; flux_lora_collection_loader: components["schemas"]["FluxLoRALoaderOutput"]; flux_lora_loader: components["schemas"]["FluxLoRALoaderOutput"]; flux_model_loader: components["schemas"]["FluxModelLoaderOutput"]; @@ -9772,7 +9870,7 @@ export type components = { * Invocation * @description The ID of the invocation */ - invocation: components["schemas"]["AddInvocation"] | components["schemas"]["AlphaMaskToTensorInvocation"] | components["schemas"]["BlankImageInvocation"] | components["schemas"]["BlendLatentsInvocation"] | components["schemas"]["BooleanCollectionInvocation"] | components["schemas"]["BooleanInvocation"] | components["schemas"]["BoundingBoxInvocation"] | components["schemas"]["CLIPSkipInvocation"] | components["schemas"]["CV2InfillInvocation"] | components["schemas"]["CalculateImageTilesEvenSplitInvocation"] | components["schemas"]["CalculateImageTilesInvocation"] | components["schemas"]["CalculateImageTilesMinimumOverlapInvocation"] | components["schemas"]["CannyEdgeDetectionInvocation"] | components["schemas"]["CannyImageProcessorInvocation"] | components["schemas"]["CanvasPasteBackInvocation"] | components["schemas"]["CanvasV2MaskAndCropInvocation"] | components["schemas"]["CenterPadCropInvocation"] | components["schemas"]["CollectInvocation"] | components["schemas"]["ColorCorrectInvocation"] | components["schemas"]["ColorInvocation"] | components["schemas"]["ColorMapImageProcessorInvocation"] | components["schemas"]["ColorMapInvocation"] | components["schemas"]["CompelInvocation"] | components["schemas"]["ConditioningCollectionInvocation"] | components["schemas"]["ConditioningInvocation"] | components["schemas"]["ContentShuffleImageProcessorInvocation"] | components["schemas"]["ContentShuffleInvocation"] | 
components["schemas"]["ControlNetInvocation"] | components["schemas"]["CoreMetadataInvocation"] | components["schemas"]["CreateDenoiseMaskInvocation"] | components["schemas"]["CreateGradientMaskInvocation"] | components["schemas"]["CropLatentsCoreInvocation"] | components["schemas"]["CvInpaintInvocation"] | components["schemas"]["DWOpenposeDetectionInvocation"] | components["schemas"]["DWOpenposeImageProcessorInvocation"] | components["schemas"]["DenoiseLatentsInvocation"] | components["schemas"]["DepthAnythingDepthEstimationInvocation"] | components["schemas"]["DepthAnythingImageProcessorInvocation"] | components["schemas"]["DivideInvocation"] | components["schemas"]["DynamicPromptInvocation"] | components["schemas"]["ESRGANInvocation"] | components["schemas"]["FLUXLoRACollectionLoader"] | components["schemas"]["FaceIdentifierInvocation"] | components["schemas"]["FaceMaskInvocation"] | components["schemas"]["FaceOffInvocation"] | components["schemas"]["FloatCollectionInvocation"] | components["schemas"]["FloatInvocation"] | components["schemas"]["FloatLinearRangeInvocation"] | components["schemas"]["FloatMathInvocation"] | components["schemas"]["FloatToIntegerInvocation"] | components["schemas"]["FluxControlNetInvocation"] | components["schemas"]["FluxDenoiseInvocation"] | components["schemas"]["FluxLoRALoaderInvocation"] | components["schemas"]["FluxModelLoaderInvocation"] | components["schemas"]["FluxTextEncoderInvocation"] | components["schemas"]["FluxVaeDecodeInvocation"] | components["schemas"]["FluxVaeEncodeInvocation"] | components["schemas"]["FreeUInvocation"] | components["schemas"]["GroundingDinoInvocation"] | components["schemas"]["HEDEdgeDetectionInvocation"] | components["schemas"]["HedImageProcessorInvocation"] | components["schemas"]["HeuristicResizeInvocation"] | components["schemas"]["IPAdapterInvocation"] | components["schemas"]["IdealSizeInvocation"] | components["schemas"]["ImageBlurInvocation"] | components["schemas"]["ImageChannelInvocation"] | components["schemas"]["ImageChannelMultiplyInvocation"] | components["schemas"]["ImageChannelOffsetInvocation"] | components["schemas"]["ImageCollectionInvocation"] | components["schemas"]["ImageConvertInvocation"] | components["schemas"]["ImageCropInvocation"] | components["schemas"]["ImageHueAdjustmentInvocation"] | components["schemas"]["ImageInverseLerpInvocation"] | components["schemas"]["ImageInvocation"] | components["schemas"]["ImageLerpInvocation"] | components["schemas"]["ImageMaskToTensorInvocation"] | components["schemas"]["ImageMultiplyInvocation"] | components["schemas"]["ImageNSFWBlurInvocation"] | components["schemas"]["ImagePasteInvocation"] | components["schemas"]["ImageResizeInvocation"] | components["schemas"]["ImageScaleInvocation"] | components["schemas"]["ImageToLatentsInvocation"] | components["schemas"]["ImageWatermarkInvocation"] | components["schemas"]["InfillColorInvocation"] | components["schemas"]["InfillPatchMatchInvocation"] | components["schemas"]["InfillTileInvocation"] | components["schemas"]["IntegerCollectionInvocation"] | components["schemas"]["IntegerInvocation"] | components["schemas"]["IntegerMathInvocation"] | components["schemas"]["InvertTensorMaskInvocation"] | components["schemas"]["IterateInvocation"] | components["schemas"]["LaMaInfillInvocation"] | components["schemas"]["LatentsCollectionInvocation"] | components["schemas"]["LatentsInvocation"] | components["schemas"]["LatentsToImageInvocation"] | components["schemas"]["LeresImageProcessorInvocation"] | 
components["schemas"]["LineartAnimeEdgeDetectionInvocation"] | components["schemas"]["LineartAnimeImageProcessorInvocation"] | components["schemas"]["LineartEdgeDetectionInvocation"] | components["schemas"]["LineartImageProcessorInvocation"] | components["schemas"]["LoRACollectionLoader"] | components["schemas"]["LoRALoaderInvocation"] | components["schemas"]["LoRASelectorInvocation"] | components["schemas"]["MLSDDetectionInvocation"] | components["schemas"]["MainModelLoaderInvocation"] | components["schemas"]["MaskCombineInvocation"] | components["schemas"]["MaskEdgeInvocation"] | components["schemas"]["MaskFromAlphaInvocation"] | components["schemas"]["MaskFromIDInvocation"] | components["schemas"]["MaskTensorToImageInvocation"] | components["schemas"]["MediaPipeFaceDetectionInvocation"] | components["schemas"]["MediapipeFaceProcessorInvocation"] | components["schemas"]["MergeMetadataInvocation"] | components["schemas"]["MergeTilesToImageInvocation"] | components["schemas"]["MetadataInvocation"] | components["schemas"]["MetadataItemInvocation"] | components["schemas"]["MidasDepthImageProcessorInvocation"] | components["schemas"]["MlsdImageProcessorInvocation"] | components["schemas"]["ModelIdentifierInvocation"] | components["schemas"]["MultiplyInvocation"] | components["schemas"]["NoiseInvocation"] | components["schemas"]["NormalMapInvocation"] | components["schemas"]["NormalbaeImageProcessorInvocation"] | components["schemas"]["PairTileImageInvocation"] | components["schemas"]["PiDiNetEdgeDetectionInvocation"] | components["schemas"]["PidiImageProcessorInvocation"] | components["schemas"]["PromptsFromFileInvocation"] | components["schemas"]["RandomFloatInvocation"] | components["schemas"]["RandomIntInvocation"] | components["schemas"]["RandomRangeInvocation"] | components["schemas"]["RangeInvocation"] | components["schemas"]["RangeOfSizeInvocation"] | components["schemas"]["RectangleMaskInvocation"] | components["schemas"]["ResizeLatentsInvocation"] | components["schemas"]["RoundInvocation"] | components["schemas"]["SDXLCompelPromptInvocation"] | components["schemas"]["SDXLLoRACollectionLoader"] | components["schemas"]["SDXLLoRALoaderInvocation"] | components["schemas"]["SDXLModelLoaderInvocation"] | components["schemas"]["SDXLRefinerCompelPromptInvocation"] | components["schemas"]["SDXLRefinerModelLoaderInvocation"] | components["schemas"]["SaveImageInvocation"] | components["schemas"]["ScaleLatentsInvocation"] | components["schemas"]["SchedulerInvocation"] | components["schemas"]["SeamlessModeInvocation"] | components["schemas"]["SegmentAnythingInvocation"] | components["schemas"]["SegmentAnythingProcessorInvocation"] | components["schemas"]["ShowImageInvocation"] | components["schemas"]["SpandrelImageToImageAutoscaleInvocation"] | components["schemas"]["SpandrelImageToImageInvocation"] | components["schemas"]["StepParamEasingInvocation"] | components["schemas"]["StringCollectionInvocation"] | components["schemas"]["StringInvocation"] | components["schemas"]["StringJoinInvocation"] | components["schemas"]["StringJoinThreeInvocation"] | components["schemas"]["StringReplaceInvocation"] | components["schemas"]["StringSplitInvocation"] | components["schemas"]["StringSplitNegInvocation"] | components["schemas"]["SubtractInvocation"] | components["schemas"]["T2IAdapterInvocation"] | components["schemas"]["TileResamplerProcessorInvocation"] | components["schemas"]["TileToPropertiesInvocation"] | components["schemas"]["TiledMultiDiffusionDenoiseLatents"] | 
components["schemas"]["UnsharpMaskInvocation"] | components["schemas"]["VAELoaderInvocation"] | components["schemas"]["ZoeDepthImageProcessorInvocation"]; + invocation: components["schemas"]["AddInvocation"] | components["schemas"]["AlphaMaskToTensorInvocation"] | components["schemas"]["BlankImageInvocation"] | components["schemas"]["BlendLatentsInvocation"] | components["schemas"]["BooleanCollectionInvocation"] | components["schemas"]["BooleanInvocation"] | components["schemas"]["BoundingBoxInvocation"] | components["schemas"]["CLIPSkipInvocation"] | components["schemas"]["CV2InfillInvocation"] | components["schemas"]["CalculateImageTilesEvenSplitInvocation"] | components["schemas"]["CalculateImageTilesInvocation"] | components["schemas"]["CalculateImageTilesMinimumOverlapInvocation"] | components["schemas"]["CannyEdgeDetectionInvocation"] | components["schemas"]["CannyImageProcessorInvocation"] | components["schemas"]["CanvasPasteBackInvocation"] | components["schemas"]["CanvasV2MaskAndCropInvocation"] | components["schemas"]["CenterPadCropInvocation"] | components["schemas"]["CollectInvocation"] | components["schemas"]["ColorCorrectInvocation"] | components["schemas"]["ColorInvocation"] | components["schemas"]["ColorMapImageProcessorInvocation"] | components["schemas"]["ColorMapInvocation"] | components["schemas"]["CompelInvocation"] | components["schemas"]["ConditioningCollectionInvocation"] | components["schemas"]["ConditioningInvocation"] | components["schemas"]["ContentShuffleImageProcessorInvocation"] | components["schemas"]["ContentShuffleInvocation"] | components["schemas"]["ControlNetInvocation"] | components["schemas"]["CoreMetadataInvocation"] | components["schemas"]["CreateDenoiseMaskInvocation"] | components["schemas"]["CreateGradientMaskInvocation"] | components["schemas"]["CropLatentsCoreInvocation"] | components["schemas"]["CvInpaintInvocation"] | components["schemas"]["DWOpenposeDetectionInvocation"] | components["schemas"]["DWOpenposeImageProcessorInvocation"] | components["schemas"]["DenoiseLatentsInvocation"] | components["schemas"]["DepthAnythingDepthEstimationInvocation"] | components["schemas"]["DepthAnythingImageProcessorInvocation"] | components["schemas"]["DivideInvocation"] | components["schemas"]["DynamicPromptInvocation"] | components["schemas"]["ESRGANInvocation"] | components["schemas"]["FLUXLoRACollectionLoader"] | components["schemas"]["FaceIdentifierInvocation"] | components["schemas"]["FaceMaskInvocation"] | components["schemas"]["FaceOffInvocation"] | components["schemas"]["FloatCollectionInvocation"] | components["schemas"]["FloatInvocation"] | components["schemas"]["FloatLinearRangeInvocation"] | components["schemas"]["FloatMathInvocation"] | components["schemas"]["FloatToIntegerInvocation"] | components["schemas"]["FluxControlNetInvocation"] | components["schemas"]["FluxDenoiseInvocation"] | components["schemas"]["FluxIPAdapterInvocation"] | components["schemas"]["FluxLoRALoaderInvocation"] | components["schemas"]["FluxModelLoaderInvocation"] | components["schemas"]["FluxTextEncoderInvocation"] | components["schemas"]["FluxVaeDecodeInvocation"] | components["schemas"]["FluxVaeEncodeInvocation"] | components["schemas"]["FreeUInvocation"] | components["schemas"]["GroundingDinoInvocation"] | components["schemas"]["HEDEdgeDetectionInvocation"] | components["schemas"]["HedImageProcessorInvocation"] | components["schemas"]["HeuristicResizeInvocation"] | components["schemas"]["IPAdapterInvocation"] | components["schemas"]["IdealSizeInvocation"] | 
components["schemas"]["ImageBlurInvocation"] | components["schemas"]["ImageChannelInvocation"] | components["schemas"]["ImageChannelMultiplyInvocation"] | components["schemas"]["ImageChannelOffsetInvocation"] | components["schemas"]["ImageCollectionInvocation"] | components["schemas"]["ImageConvertInvocation"] | components["schemas"]["ImageCropInvocation"] | components["schemas"]["ImageHueAdjustmentInvocation"] | components["schemas"]["ImageInverseLerpInvocation"] | components["schemas"]["ImageInvocation"] | components["schemas"]["ImageLerpInvocation"] | components["schemas"]["ImageMaskToTensorInvocation"] | components["schemas"]["ImageMultiplyInvocation"] | components["schemas"]["ImageNSFWBlurInvocation"] | components["schemas"]["ImagePasteInvocation"] | components["schemas"]["ImageResizeInvocation"] | components["schemas"]["ImageScaleInvocation"] | components["schemas"]["ImageToLatentsInvocation"] | components["schemas"]["ImageWatermarkInvocation"] | components["schemas"]["InfillColorInvocation"] | components["schemas"]["InfillPatchMatchInvocation"] | components["schemas"]["InfillTileInvocation"] | components["schemas"]["IntegerCollectionInvocation"] | components["schemas"]["IntegerInvocation"] | components["schemas"]["IntegerMathInvocation"] | components["schemas"]["InvertTensorMaskInvocation"] | components["schemas"]["IterateInvocation"] | components["schemas"]["LaMaInfillInvocation"] | components["schemas"]["LatentsCollectionInvocation"] | components["schemas"]["LatentsInvocation"] | components["schemas"]["LatentsToImageInvocation"] | components["schemas"]["LeresImageProcessorInvocation"] | components["schemas"]["LineartAnimeEdgeDetectionInvocation"] | components["schemas"]["LineartAnimeImageProcessorInvocation"] | components["schemas"]["LineartEdgeDetectionInvocation"] | components["schemas"]["LineartImageProcessorInvocation"] | components["schemas"]["LoRACollectionLoader"] | components["schemas"]["LoRALoaderInvocation"] | components["schemas"]["LoRASelectorInvocation"] | components["schemas"]["MLSDDetectionInvocation"] | components["schemas"]["MainModelLoaderInvocation"] | components["schemas"]["MaskCombineInvocation"] | components["schemas"]["MaskEdgeInvocation"] | components["schemas"]["MaskFromAlphaInvocation"] | components["schemas"]["MaskFromIDInvocation"] | components["schemas"]["MaskTensorToImageInvocation"] | components["schemas"]["MediaPipeFaceDetectionInvocation"] | components["schemas"]["MediapipeFaceProcessorInvocation"] | components["schemas"]["MergeMetadataInvocation"] | components["schemas"]["MergeTilesToImageInvocation"] | components["schemas"]["MetadataInvocation"] | components["schemas"]["MetadataItemInvocation"] | components["schemas"]["MidasDepthImageProcessorInvocation"] | components["schemas"]["MlsdImageProcessorInvocation"] | components["schemas"]["ModelIdentifierInvocation"] | components["schemas"]["MultiplyInvocation"] | components["schemas"]["NoiseInvocation"] | components["schemas"]["NormalMapInvocation"] | components["schemas"]["NormalbaeImageProcessorInvocation"] | components["schemas"]["PairTileImageInvocation"] | components["schemas"]["PiDiNetEdgeDetectionInvocation"] | components["schemas"]["PidiImageProcessorInvocation"] | components["schemas"]["PromptsFromFileInvocation"] | components["schemas"]["RandomFloatInvocation"] | components["schemas"]["RandomIntInvocation"] | components["schemas"]["RandomRangeInvocation"] | components["schemas"]["RangeInvocation"] | components["schemas"]["RangeOfSizeInvocation"] | 
components["schemas"]["RectangleMaskInvocation"] | components["schemas"]["ResizeLatentsInvocation"] | components["schemas"]["RoundInvocation"] | components["schemas"]["SDXLCompelPromptInvocation"] | components["schemas"]["SDXLLoRACollectionLoader"] | components["schemas"]["SDXLLoRALoaderInvocation"] | components["schemas"]["SDXLModelLoaderInvocation"] | components["schemas"]["SDXLRefinerCompelPromptInvocation"] | components["schemas"]["SDXLRefinerModelLoaderInvocation"] | components["schemas"]["SaveImageInvocation"] | components["schemas"]["ScaleLatentsInvocation"] | components["schemas"]["SchedulerInvocation"] | components["schemas"]["SeamlessModeInvocation"] | components["schemas"]["SegmentAnythingInvocation"] | components["schemas"]["SegmentAnythingProcessorInvocation"] | components["schemas"]["ShowImageInvocation"] | components["schemas"]["SpandrelImageToImageAutoscaleInvocation"] | components["schemas"]["SpandrelImageToImageInvocation"] | components["schemas"]["StepParamEasingInvocation"] | components["schemas"]["StringCollectionInvocation"] | components["schemas"]["StringInvocation"] | components["schemas"]["StringJoinInvocation"] | components["schemas"]["StringJoinThreeInvocation"] | components["schemas"]["StringReplaceInvocation"] | components["schemas"]["StringSplitInvocation"] | components["schemas"]["StringSplitNegInvocation"] | components["schemas"]["SubtractInvocation"] | components["schemas"]["T2IAdapterInvocation"] | components["schemas"]["TileResamplerProcessorInvocation"] | components["schemas"]["TileToPropertiesInvocation"] | components["schemas"]["TiledMultiDiffusionDenoiseLatents"] | components["schemas"]["UnsharpMaskInvocation"] | components["schemas"]["VAELoaderInvocation"] | components["schemas"]["ZoeDepthImageProcessorInvocation"]; /** * Invocation Source Id * @description The ID of the prepared invocation's source node @@ -9841,7 +9939,7 @@ export type components = { * Invocation * @description The ID of the invocation */ - invocation: components["schemas"]["AddInvocation"] | components["schemas"]["AlphaMaskToTensorInvocation"] | components["schemas"]["BlankImageInvocation"] | components["schemas"]["BlendLatentsInvocation"] | components["schemas"]["BooleanCollectionInvocation"] | components["schemas"]["BooleanInvocation"] | components["schemas"]["BoundingBoxInvocation"] | components["schemas"]["CLIPSkipInvocation"] | components["schemas"]["CV2InfillInvocation"] | components["schemas"]["CalculateImageTilesEvenSplitInvocation"] | components["schemas"]["CalculateImageTilesInvocation"] | components["schemas"]["CalculateImageTilesMinimumOverlapInvocation"] | components["schemas"]["CannyEdgeDetectionInvocation"] | components["schemas"]["CannyImageProcessorInvocation"] | components["schemas"]["CanvasPasteBackInvocation"] | components["schemas"]["CanvasV2MaskAndCropInvocation"] | components["schemas"]["CenterPadCropInvocation"] | components["schemas"]["CollectInvocation"] | components["schemas"]["ColorCorrectInvocation"] | components["schemas"]["ColorInvocation"] | components["schemas"]["ColorMapImageProcessorInvocation"] | components["schemas"]["ColorMapInvocation"] | components["schemas"]["CompelInvocation"] | components["schemas"]["ConditioningCollectionInvocation"] | components["schemas"]["ConditioningInvocation"] | components["schemas"]["ContentShuffleImageProcessorInvocation"] | components["schemas"]["ContentShuffleInvocation"] | components["schemas"]["ControlNetInvocation"] | components["schemas"]["CoreMetadataInvocation"] | 
components["schemas"]["CreateDenoiseMaskInvocation"] | components["schemas"]["CreateGradientMaskInvocation"] | components["schemas"]["CropLatentsCoreInvocation"] | components["schemas"]["CvInpaintInvocation"] | components["schemas"]["DWOpenposeDetectionInvocation"] | components["schemas"]["DWOpenposeImageProcessorInvocation"] | components["schemas"]["DenoiseLatentsInvocation"] | components["schemas"]["DepthAnythingDepthEstimationInvocation"] | components["schemas"]["DepthAnythingImageProcessorInvocation"] | components["schemas"]["DivideInvocation"] | components["schemas"]["DynamicPromptInvocation"] | components["schemas"]["ESRGANInvocation"] | components["schemas"]["FLUXLoRACollectionLoader"] | components["schemas"]["FaceIdentifierInvocation"] | components["schemas"]["FaceMaskInvocation"] | components["schemas"]["FaceOffInvocation"] | components["schemas"]["FloatCollectionInvocation"] | components["schemas"]["FloatInvocation"] | components["schemas"]["FloatLinearRangeInvocation"] | components["schemas"]["FloatMathInvocation"] | components["schemas"]["FloatToIntegerInvocation"] | components["schemas"]["FluxControlNetInvocation"] | components["schemas"]["FluxDenoiseInvocation"] | components["schemas"]["FluxLoRALoaderInvocation"] | components["schemas"]["FluxModelLoaderInvocation"] | components["schemas"]["FluxTextEncoderInvocation"] | components["schemas"]["FluxVaeDecodeInvocation"] | components["schemas"]["FluxVaeEncodeInvocation"] | components["schemas"]["FreeUInvocation"] | components["schemas"]["GroundingDinoInvocation"] | components["schemas"]["HEDEdgeDetectionInvocation"] | components["schemas"]["HedImageProcessorInvocation"] | components["schemas"]["HeuristicResizeInvocation"] | components["schemas"]["IPAdapterInvocation"] | components["schemas"]["IdealSizeInvocation"] | components["schemas"]["ImageBlurInvocation"] | components["schemas"]["ImageChannelInvocation"] | components["schemas"]["ImageChannelMultiplyInvocation"] | components["schemas"]["ImageChannelOffsetInvocation"] | components["schemas"]["ImageCollectionInvocation"] | components["schemas"]["ImageConvertInvocation"] | components["schemas"]["ImageCropInvocation"] | components["schemas"]["ImageHueAdjustmentInvocation"] | components["schemas"]["ImageInverseLerpInvocation"] | components["schemas"]["ImageInvocation"] | components["schemas"]["ImageLerpInvocation"] | components["schemas"]["ImageMaskToTensorInvocation"] | components["schemas"]["ImageMultiplyInvocation"] | components["schemas"]["ImageNSFWBlurInvocation"] | components["schemas"]["ImagePasteInvocation"] | components["schemas"]["ImageResizeInvocation"] | components["schemas"]["ImageScaleInvocation"] | components["schemas"]["ImageToLatentsInvocation"] | components["schemas"]["ImageWatermarkInvocation"] | components["schemas"]["InfillColorInvocation"] | components["schemas"]["InfillPatchMatchInvocation"] | components["schemas"]["InfillTileInvocation"] | components["schemas"]["IntegerCollectionInvocation"] | components["schemas"]["IntegerInvocation"] | components["schemas"]["IntegerMathInvocation"] | components["schemas"]["InvertTensorMaskInvocation"] | components["schemas"]["IterateInvocation"] | components["schemas"]["LaMaInfillInvocation"] | components["schemas"]["LatentsCollectionInvocation"] | components["schemas"]["LatentsInvocation"] | components["schemas"]["LatentsToImageInvocation"] | components["schemas"]["LeresImageProcessorInvocation"] | components["schemas"]["LineartAnimeEdgeDetectionInvocation"] | components["schemas"]["LineartAnimeImageProcessorInvocation"] 
| components["schemas"]["LineartEdgeDetectionInvocation"] | components["schemas"]["LineartImageProcessorInvocation"] | components["schemas"]["LoRACollectionLoader"] | components["schemas"]["LoRALoaderInvocation"] | components["schemas"]["LoRASelectorInvocation"] | components["schemas"]["MLSDDetectionInvocation"] | components["schemas"]["MainModelLoaderInvocation"] | components["schemas"]["MaskCombineInvocation"] | components["schemas"]["MaskEdgeInvocation"] | components["schemas"]["MaskFromAlphaInvocation"] | components["schemas"]["MaskFromIDInvocation"] | components["schemas"]["MaskTensorToImageInvocation"] | components["schemas"]["MediaPipeFaceDetectionInvocation"] | components["schemas"]["MediapipeFaceProcessorInvocation"] | components["schemas"]["MergeMetadataInvocation"] | components["schemas"]["MergeTilesToImageInvocation"] | components["schemas"]["MetadataInvocation"] | components["schemas"]["MetadataItemInvocation"] | components["schemas"]["MidasDepthImageProcessorInvocation"] | components["schemas"]["MlsdImageProcessorInvocation"] | components["schemas"]["ModelIdentifierInvocation"] | components["schemas"]["MultiplyInvocation"] | components["schemas"]["NoiseInvocation"] | components["schemas"]["NormalMapInvocation"] | components["schemas"]["NormalbaeImageProcessorInvocation"] | components["schemas"]["PairTileImageInvocation"] | components["schemas"]["PiDiNetEdgeDetectionInvocation"] | components["schemas"]["PidiImageProcessorInvocation"] | components["schemas"]["PromptsFromFileInvocation"] | components["schemas"]["RandomFloatInvocation"] | components["schemas"]["RandomIntInvocation"] | components["schemas"]["RandomRangeInvocation"] | components["schemas"]["RangeInvocation"] | components["schemas"]["RangeOfSizeInvocation"] | components["schemas"]["RectangleMaskInvocation"] | components["schemas"]["ResizeLatentsInvocation"] | components["schemas"]["RoundInvocation"] | components["schemas"]["SDXLCompelPromptInvocation"] | components["schemas"]["SDXLLoRACollectionLoader"] | components["schemas"]["SDXLLoRALoaderInvocation"] | components["schemas"]["SDXLModelLoaderInvocation"] | components["schemas"]["SDXLRefinerCompelPromptInvocation"] | components["schemas"]["SDXLRefinerModelLoaderInvocation"] | components["schemas"]["SaveImageInvocation"] | components["schemas"]["ScaleLatentsInvocation"] | components["schemas"]["SchedulerInvocation"] | components["schemas"]["SeamlessModeInvocation"] | components["schemas"]["SegmentAnythingInvocation"] | components["schemas"]["SegmentAnythingProcessorInvocation"] | components["schemas"]["ShowImageInvocation"] | components["schemas"]["SpandrelImageToImageAutoscaleInvocation"] | components["schemas"]["SpandrelImageToImageInvocation"] | components["schemas"]["StepParamEasingInvocation"] | components["schemas"]["StringCollectionInvocation"] | components["schemas"]["StringInvocation"] | components["schemas"]["StringJoinInvocation"] | components["schemas"]["StringJoinThreeInvocation"] | components["schemas"]["StringReplaceInvocation"] | components["schemas"]["StringSplitInvocation"] | components["schemas"]["StringSplitNegInvocation"] | components["schemas"]["SubtractInvocation"] | components["schemas"]["T2IAdapterInvocation"] | components["schemas"]["TileResamplerProcessorInvocation"] | components["schemas"]["TileToPropertiesInvocation"] | components["schemas"]["TiledMultiDiffusionDenoiseLatents"] | components["schemas"]["UnsharpMaskInvocation"] | components["schemas"]["VAELoaderInvocation"] | components["schemas"]["ZoeDepthImageProcessorInvocation"]; + 
invocation: components["schemas"]["AddInvocation"] | components["schemas"]["AlphaMaskToTensorInvocation"] | components["schemas"]["BlankImageInvocation"] | components["schemas"]["BlendLatentsInvocation"] | components["schemas"]["BooleanCollectionInvocation"] | components["schemas"]["BooleanInvocation"] | components["schemas"]["BoundingBoxInvocation"] | components["schemas"]["CLIPSkipInvocation"] | components["schemas"]["CV2InfillInvocation"] | components["schemas"]["CalculateImageTilesEvenSplitInvocation"] | components["schemas"]["CalculateImageTilesInvocation"] | components["schemas"]["CalculateImageTilesMinimumOverlapInvocation"] | components["schemas"]["CannyEdgeDetectionInvocation"] | components["schemas"]["CannyImageProcessorInvocation"] | components["schemas"]["CanvasPasteBackInvocation"] | components["schemas"]["CanvasV2MaskAndCropInvocation"] | components["schemas"]["CenterPadCropInvocation"] | components["schemas"]["CollectInvocation"] | components["schemas"]["ColorCorrectInvocation"] | components["schemas"]["ColorInvocation"] | components["schemas"]["ColorMapImageProcessorInvocation"] | components["schemas"]["ColorMapInvocation"] | components["schemas"]["CompelInvocation"] | components["schemas"]["ConditioningCollectionInvocation"] | components["schemas"]["ConditioningInvocation"] | components["schemas"]["ContentShuffleImageProcessorInvocation"] | components["schemas"]["ContentShuffleInvocation"] | components["schemas"]["ControlNetInvocation"] | components["schemas"]["CoreMetadataInvocation"] | components["schemas"]["CreateDenoiseMaskInvocation"] | components["schemas"]["CreateGradientMaskInvocation"] | components["schemas"]["CropLatentsCoreInvocation"] | components["schemas"]["CvInpaintInvocation"] | components["schemas"]["DWOpenposeDetectionInvocation"] | components["schemas"]["DWOpenposeImageProcessorInvocation"] | components["schemas"]["DenoiseLatentsInvocation"] | components["schemas"]["DepthAnythingDepthEstimationInvocation"] | components["schemas"]["DepthAnythingImageProcessorInvocation"] | components["schemas"]["DivideInvocation"] | components["schemas"]["DynamicPromptInvocation"] | components["schemas"]["ESRGANInvocation"] | components["schemas"]["FLUXLoRACollectionLoader"] | components["schemas"]["FaceIdentifierInvocation"] | components["schemas"]["FaceMaskInvocation"] | components["schemas"]["FaceOffInvocation"] | components["schemas"]["FloatCollectionInvocation"] | components["schemas"]["FloatInvocation"] | components["schemas"]["FloatLinearRangeInvocation"] | components["schemas"]["FloatMathInvocation"] | components["schemas"]["FloatToIntegerInvocation"] | components["schemas"]["FluxControlNetInvocation"] | components["schemas"]["FluxDenoiseInvocation"] | components["schemas"]["FluxIPAdapterInvocation"] | components["schemas"]["FluxLoRALoaderInvocation"] | components["schemas"]["FluxModelLoaderInvocation"] | components["schemas"]["FluxTextEncoderInvocation"] | components["schemas"]["FluxVaeDecodeInvocation"] | components["schemas"]["FluxVaeEncodeInvocation"] | components["schemas"]["FreeUInvocation"] | components["schemas"]["GroundingDinoInvocation"] | components["schemas"]["HEDEdgeDetectionInvocation"] | components["schemas"]["HedImageProcessorInvocation"] | components["schemas"]["HeuristicResizeInvocation"] | components["schemas"]["IPAdapterInvocation"] | components["schemas"]["IdealSizeInvocation"] | components["schemas"]["ImageBlurInvocation"] | components["schemas"]["ImageChannelInvocation"] | components["schemas"]["ImageChannelMultiplyInvocation"] | 
components["schemas"]["ImageChannelOffsetInvocation"] | components["schemas"]["ImageCollectionInvocation"] | components["schemas"]["ImageConvertInvocation"] | components["schemas"]["ImageCropInvocation"] | components["schemas"]["ImageHueAdjustmentInvocation"] | components["schemas"]["ImageInverseLerpInvocation"] | components["schemas"]["ImageInvocation"] | components["schemas"]["ImageLerpInvocation"] | components["schemas"]["ImageMaskToTensorInvocation"] | components["schemas"]["ImageMultiplyInvocation"] | components["schemas"]["ImageNSFWBlurInvocation"] | components["schemas"]["ImagePasteInvocation"] | components["schemas"]["ImageResizeInvocation"] | components["schemas"]["ImageScaleInvocation"] | components["schemas"]["ImageToLatentsInvocation"] | components["schemas"]["ImageWatermarkInvocation"] | components["schemas"]["InfillColorInvocation"] | components["schemas"]["InfillPatchMatchInvocation"] | components["schemas"]["InfillTileInvocation"] | components["schemas"]["IntegerCollectionInvocation"] | components["schemas"]["IntegerInvocation"] | components["schemas"]["IntegerMathInvocation"] | components["schemas"]["InvertTensorMaskInvocation"] | components["schemas"]["IterateInvocation"] | components["schemas"]["LaMaInfillInvocation"] | components["schemas"]["LatentsCollectionInvocation"] | components["schemas"]["LatentsInvocation"] | components["schemas"]["LatentsToImageInvocation"] | components["schemas"]["LeresImageProcessorInvocation"] | components["schemas"]["LineartAnimeEdgeDetectionInvocation"] | components["schemas"]["LineartAnimeImageProcessorInvocation"] | components["schemas"]["LineartEdgeDetectionInvocation"] | components["schemas"]["LineartImageProcessorInvocation"] | components["schemas"]["LoRACollectionLoader"] | components["schemas"]["LoRALoaderInvocation"] | components["schemas"]["LoRASelectorInvocation"] | components["schemas"]["MLSDDetectionInvocation"] | components["schemas"]["MainModelLoaderInvocation"] | components["schemas"]["MaskCombineInvocation"] | components["schemas"]["MaskEdgeInvocation"] | components["schemas"]["MaskFromAlphaInvocation"] | components["schemas"]["MaskFromIDInvocation"] | components["schemas"]["MaskTensorToImageInvocation"] | components["schemas"]["MediaPipeFaceDetectionInvocation"] | components["schemas"]["MediapipeFaceProcessorInvocation"] | components["schemas"]["MergeMetadataInvocation"] | components["schemas"]["MergeTilesToImageInvocation"] | components["schemas"]["MetadataInvocation"] | components["schemas"]["MetadataItemInvocation"] | components["schemas"]["MidasDepthImageProcessorInvocation"] | components["schemas"]["MlsdImageProcessorInvocation"] | components["schemas"]["ModelIdentifierInvocation"] | components["schemas"]["MultiplyInvocation"] | components["schemas"]["NoiseInvocation"] | components["schemas"]["NormalMapInvocation"] | components["schemas"]["NormalbaeImageProcessorInvocation"] | components["schemas"]["PairTileImageInvocation"] | components["schemas"]["PiDiNetEdgeDetectionInvocation"] | components["schemas"]["PidiImageProcessorInvocation"] | components["schemas"]["PromptsFromFileInvocation"] | components["schemas"]["RandomFloatInvocation"] | components["schemas"]["RandomIntInvocation"] | components["schemas"]["RandomRangeInvocation"] | components["schemas"]["RangeInvocation"] | components["schemas"]["RangeOfSizeInvocation"] | components["schemas"]["RectangleMaskInvocation"] | components["schemas"]["ResizeLatentsInvocation"] | components["schemas"]["RoundInvocation"] | components["schemas"]["SDXLCompelPromptInvocation"] | 
components["schemas"]["SDXLLoRACollectionLoader"] | components["schemas"]["SDXLLoRALoaderInvocation"] | components["schemas"]["SDXLModelLoaderInvocation"] | components["schemas"]["SDXLRefinerCompelPromptInvocation"] | components["schemas"]["SDXLRefinerModelLoaderInvocation"] | components["schemas"]["SaveImageInvocation"] | components["schemas"]["ScaleLatentsInvocation"] | components["schemas"]["SchedulerInvocation"] | components["schemas"]["SeamlessModeInvocation"] | components["schemas"]["SegmentAnythingInvocation"] | components["schemas"]["SegmentAnythingProcessorInvocation"] | components["schemas"]["ShowImageInvocation"] | components["schemas"]["SpandrelImageToImageAutoscaleInvocation"] | components["schemas"]["SpandrelImageToImageInvocation"] | components["schemas"]["StepParamEasingInvocation"] | components["schemas"]["StringCollectionInvocation"] | components["schemas"]["StringInvocation"] | components["schemas"]["StringJoinInvocation"] | components["schemas"]["StringJoinThreeInvocation"] | components["schemas"]["StringReplaceInvocation"] | components["schemas"]["StringSplitInvocation"] | components["schemas"]["StringSplitNegInvocation"] | components["schemas"]["SubtractInvocation"] | components["schemas"]["T2IAdapterInvocation"] | components["schemas"]["TileResamplerProcessorInvocation"] | components["schemas"]["TileToPropertiesInvocation"] | components["schemas"]["TiledMultiDiffusionDenoiseLatents"] | components["schemas"]["UnsharpMaskInvocation"] | components["schemas"]["VAELoaderInvocation"] | components["schemas"]["ZoeDepthImageProcessorInvocation"]; /** * Invocation Source Id * @description The ID of the prepared invocation's source node From f53823b45edff1dfa2c14b262f2283f163b0b71e Mon Sep 17 00:00:00 2001 From: psychedelicious <4822129+psychedelicious@users.noreply.github.com> Date: Wed, 23 Oct 2024 08:29:14 +1000 Subject: [PATCH 27/30] fix(ui): update CLIP Vision when ipa model changes --- .../features/controlLayers/store/canvasSlice.ts | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/invokeai/frontend/web/src/features/controlLayers/store/canvasSlice.ts b/invokeai/frontend/web/src/features/controlLayers/store/canvasSlice.ts index b4b784e80d6..fd7b4958436 100644 --- a/invokeai/frontend/web/src/features/controlLayers/store/canvasSlice.ts +++ b/invokeai/frontend/web/src/features/controlLayers/store/canvasSlice.ts @@ -381,6 +381,13 @@ export const canvasSlice = createSlice({ return; } entity.ipAdapter.model = modelConfig ? zModelIdentifierField.parse(modelConfig) : null; + // Ensure that the IP Adapter model is compatible with the CLIP Vision model + if (entity.ipAdapter.model?.base === 'flux') { + entity.ipAdapter.clipVisionModel = 'ViT-L'; + } else if (entity.ipAdapter.clipVisionModel === 'ViT-L') { + // Fall back to ViT-H (ViT-G would also work) + entity.ipAdapter.clipVisionModel = 'ViT-H'; + } }, referenceImageIPAdapterCLIPVisionModelChanged: ( state, @@ -577,6 +584,13 @@ export const canvasSlice = createSlice({ return; } referenceImage.ipAdapter.model = modelConfig ? 
zModelIdentifierField.parse(modelConfig) : null; + // Ensure that the IP Adapter model is compatible with the CLIP Vision model + if (referenceImage.ipAdapter.model?.base === 'flux') { + referenceImage.ipAdapter.clipVisionModel = 'ViT-L'; + } else if (referenceImage.ipAdapter.clipVisionModel === 'ViT-L') { + // Fall back to ViT-H (ViT-G would also work) + referenceImage.ipAdapter.clipVisionModel = 'ViT-H'; + } }, rgIPAdapterCLIPVisionModelChanged: ( state, From bf3260446d1744a534cdc0a5d7077cefb7fc6242 Mon Sep 17 00:00:00 2001 From: psychedelicious <4822129+psychedelicious@users.noreply.github.com> Date: Wed, 23 Oct 2024 08:30:11 +1000 Subject: [PATCH 28/30] fix(ui): use `flux_ip_adapter` for flux --- .../util/graph/generation/addIPAdapters.ts | 50 ++++++++++++++----- 1 file changed, 37 insertions(+), 13 deletions(-) diff --git a/invokeai/frontend/web/src/features/nodes/util/graph/generation/addIPAdapters.ts b/invokeai/frontend/web/src/features/nodes/util/graph/generation/addIPAdapters.ts index c4c95d4a5a2..fe91b52f22b 100644 --- a/invokeai/frontend/web/src/features/nodes/util/graph/generation/addIPAdapters.ts +++ b/invokeai/frontend/web/src/features/nodes/util/graph/generation/addIPAdapters.ts @@ -34,19 +34,43 @@ const addIPAdapter = (entity: CanvasReferenceImageState, g: Graph, collector: In assert(image, 'IP Adapter image is required'); assert(model, 'IP Adapter model is required'); - const ipAdapterNode = g.addNode({ - id: `ip_adapter_${id}`, - type: 'ip_adapter', - weight, - method, - ip_adapter_model: model, - clip_vision_model: clipVisionModel, - begin_step_percent: beginEndStepPct[0], - end_step_percent: beginEndStepPct[1], - image: { - image_name: image.image_name, - }, - }); + let ipAdapterNode: Invocation<'flux_ip_adapter' | 'ip_adapter'>; + + if (model.base === 'flux') { + assert(clipVisionModel === 'ViT-L', 'ViT-L is the only supported CLIP Vision model for FLUX IP adapter'); + ipAdapterNode = g.addNode({ + id: `ip_adapter_${id}`, + type: 'flux_ip_adapter', + weight, + ip_adapter_model: model, + clip_vision_model: clipVisionModel, + begin_step_percent: beginEndStepPct[0], + end_step_percent: beginEndStepPct[1], + image: { + image_name: image.image_name, + }, + }); + } else { + // model.base === SD1.5 or SDXL + assert( + clipVisionModel === 'ViT-H' || clipVisionModel === 'ViT-G', + 'ViT-G and ViT-H are the only supported CLIP Vision models for SD1.5 and SDXL IP adapters' + ); + ipAdapterNode = g.addNode({ + id: `ip_adapter_${id}`, + type: 'ip_adapter', + weight, + method, + ip_adapter_model: model, + clip_vision_model: clipVisionModel, + begin_step_percent: beginEndStepPct[0], + end_step_percent: beginEndStepPct[1], + image: { + image_name: image.image_name, + }, + }); + } + g.addEdge(ipAdapterNode, 'ip_adapter', collector, 'item'); }; From ee8975401aa6379dc858d089c049bbef7a2e2b2b Mon Sep 17 00:00:00 2001 From: psychedelicious <4822129+psychedelicious@users.noreply.github.com> Date: Wed, 23 Oct 2024 08:31:10 +1000 Subject: [PATCH 29/30] fix(ui): remove special handling for flux in `IPAdapterModel` This masked an issue w/ the CLIP Vision model. Issue is now handled in reducer/graph builder. 
--- .../components/IPAdapter/IPAdapterModel.tsx | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/invokeai/frontend/web/src/features/controlLayers/components/IPAdapter/IPAdapterModel.tsx b/invokeai/frontend/web/src/features/controlLayers/components/IPAdapter/IPAdapterModel.tsx index 218582535f3..682c272f892 100644 --- a/invokeai/frontend/web/src/features/controlLayers/components/IPAdapter/IPAdapterModel.tsx +++ b/invokeai/frontend/web/src/features/controlLayers/components/IPAdapter/IPAdapterModel.tsx @@ -71,19 +71,15 @@ export const IPAdapterModel = memo(({ modelKey, onChangeModel, clipVisionModel, }); const clipVisionOptions = useMemo(() => { - if (isFLUX) { - return CLIP_VISION_OPTIONS.map((option) => ({ ...option, isDisabled: option.value !== FLUX_CLIP_VISION })); - } else { - return CLIP_VISION_OPTIONS; - } + return CLIP_VISION_OPTIONS.map((option) => ({ + ...option, + isDisabled: isFLUX && option.value !== FLUX_CLIP_VISION, + })); }, [isFLUX]); const clipVisionModelValue = useMemo(() => { - if (isFLUX) { - return CLIP_VISION_OPTIONS.find((o) => o.value === FLUX_CLIP_VISION); - } return CLIP_VISION_OPTIONS.find((o) => o.value === clipVisionModel); - }, [clipVisionModel, isFLUX]); + }, [clipVisionModel]); return ( From 61496fdcbc657eb070916c152f3e014e3dfd240a Mon Sep 17 00:00:00 2001 From: psychedelicious <4822129+psychedelicious@users.noreply.github.com> Date: Wed, 23 Oct 2024 08:34:15 +1000 Subject: [PATCH 30/30] fix(nodes): load IP Adapter images as RGB FLUX IP Adapter only works with RGB. Did the same for non-FLUX to be safe & consistent, though I don't think it's strictly necessary. --- invokeai/app/invocations/denoise_latents.py | 4 +++- invokeai/app/invocations/flux_denoise.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/invokeai/app/invocations/denoise_latents.py b/invokeai/app/invocations/denoise_latents.py index 34295b5e229..0c3a8921547 100644 --- a/invokeai/app/invocations/denoise_latents.py +++ b/invokeai/app/invocations/denoise_latents.py @@ -547,7 +547,9 @@ def prep_ip_adapter_image_prompts( if not isinstance(single_ipa_image_fields, list): single_ipa_image_fields = [single_ipa_image_fields] - single_ipa_images = [context.images.get_pil(image.image_name) for image in single_ipa_image_fields] + single_ipa_images = [ + context.images.get_pil(image.image_name, mode="RGB") for image in single_ipa_image_fields + ] with image_encoder_model_info as image_encoder_model: assert isinstance(image_encoder_model, CLIPVisionModelWithProjection) # Get image embeddings from CLIP and ImageProjModel. diff --git a/invokeai/app/invocations/flux_denoise.py b/invokeai/app/invocations/flux_denoise.py index 2e7dba406c2..c9907ce4082 100644 --- a/invokeai/app/invocations/flux_denoise.py +++ b/invokeai/app/invocations/flux_denoise.py @@ -576,7 +576,7 @@ def _prep_ip_adapter_image_prompt_clip_embeds( f"FLUX IP-Adapter only supports a single image prompt (received {len(ipa_image_fields)})." ) - ipa_images = [context.images.get_pil(image.image_name) for image in ipa_image_fields] + ipa_images = [context.images.get_pil(image.image_name, mode="RGB") for image in ipa_image_fields] pos_images: list[npt.NDArray[np.uint8]] = [] neg_images: list[npt.NDArray[np.uint8]] = []
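Taken together, the three UI patches above converge on one pairing rule: a FLUX IP Adapter must use the ViT-L CLIP Vision encoder, while SD1.5/SDXL IP Adapters use ViT-H or ViT-G. The Python sketch below restates that rule in one place; the helper name and the plain-string model identifiers are illustrative assumptions, not code from the patches.

def pick_clip_vision_model(ip_adapter_base: str, current: str | None) -> str:
    """Return a CLIP Vision model compatible with the IP Adapter's base model."""
    if ip_adapter_base == "flux":
        # FLUX IP Adapters are only paired with ViT-L.
        return "ViT-L"
    if current in ("ViT-H", "ViT-G"):
        # SD1.5/SDXL adapters keep a valid existing choice.
        return current
    # Otherwise fall back to ViT-H, mirroring the reducer's fallback comment.
    return "ViT-H"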
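The last patch forces reference images to load in RGB because CLIP Vision encoders expect 3-channel input; an RGBA, palette, or grayscale image loaded as-is would reach the encoder with the wrong channel count. Below is a minimal sketch of the same normalization using plain Pillow/NumPy rather than InvokeAI's image service (the function name and direct Pillow usage are assumptions for illustration only).

import numpy as np
from PIL import Image

def load_reference_image_rgb(path: str) -> np.ndarray:
    # Mirror the intent of get_pil(image_name, mode="RGB"): always hand the
    # CLIP Vision preprocessor an (H, W, 3) uint8 array, regardless of the
    # source image's mode (RGBA, palette, grayscale, ...).
    img = Image.open(path).convert("RGB")
    return np.asarray(img, dtype=np.uint8)

# e.g. an RGBA PNG yields shape (H, W, 3) instead of (H, W, 4):
# arr = load_reference_image_rgb("reference.png")
# assert arr.ndim == 3 and arr.shape[-1] == 3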