diff --git a/diffusers/scripts/exp_sdxl_demo.sh b/diffusers/scripts/exp_sdxl_demo.sh
new file mode 100755
index 0000000..602d30e
--- /dev/null
+++ b/diffusers/scripts/exp_sdxl_demo.sh
@@ -0,0 +1 @@
+python scripts/train_secmi.py --model-type sdxl --ckpt-path ../models/diffusers/Kohaku-XL-Epsilon/ --member-dataset hakubooru-2-5k-member --holdout-dataset hakubooru-2-5k-nonmember --batch-size 3 --demo True
diff --git a/diffusers/scripts/test.py b/diffusers/scripts/test.py
new file mode 100644
index 0000000..2332fcf
--- /dev/null
+++ b/diffusers/scripts/test.py
@@ -0,0 +1,10 @@
+import numpy as np
+from sklearn import preprocessing
+
+member_features = np.array([np.nan, np.inf, -np.inf, 1000, 0, 4, 30000, -899])
+membermax, membermin = np.nanmax(member_features[~np.isposinf(member_features)]), np.nanmin(member_features[~np.isneginf(member_features)])
+member_features = np.nan_to_num(member_features, nan=0, posinf=membermax, neginf=membermin)
+
+x = preprocessing.scale(member_features)
+x = np.nan_to_num(x, nan=0)
+print(x, member_features)
\ No newline at end of file
diff --git a/diffusers/scripts/train_gsa.py b/diffusers/scripts/train_gsa.py
index f879dfe..f2cc43d 100644
--- a/diffusers/scripts/train_gsa.py
+++ b/diffusers/scripts/train_gsa.py
@@ -50,9 +50,9 @@ def get_reverse_denoise_results(pipe, dataloader, device, gsa_mode, demo):
     features, path_log = [], []
     for batch_idx, batch in enumerate(tqdm.tqdm(dataloader)):
         path_log.extend(batch['path'])
-        latents, encoder_hidden_states = pipe.prepare_inputs(batch, weight_dtype, device)
+        latents, encoder_hidden_states, prompts = pipe.prepare_inputs(batch, weight_dtype, device)
         out = pipe(\
-            accelerator=accelerator, optimizer=optimizer, prompt=None, latents=latents, \
+            accelerator=accelerator, optimizer=optimizer, prompt=prompts, latents=latents, \
             prompt_embeds=encoder_hidden_states, guidance_scale=1.0, num_inference_steps=20, gsa_mode=gsa_mode)
         gsa_features = out.gsa_features # # [bsz x Tensor(num_p)]
         # print(f"gsa: {gsa_features}")
diff --git a/diffusers/scripts/train_pfami.py b/diffusers/scripts/train_pfami.py
index 22f55cd..a1aacd1 100644
--- a/diffusers/scripts/train_pfami.py
+++ b/diffusers/scripts/train_pfami.py
@@ -66,8 +66,8 @@ def get_reverse_denoise_results(pipe, dataloader, device, strengths, demo):
         for strength in strengths:
             input_batch = deepcopy(batch)
             input_batch["pixel_values"] = image_perturbation(input_batch["pixel_values"], strength)
-            latents, encoder_hidden_states = pipe.prepare_inputs(input_batch, weight_dtype, device)
-            out = pipe(prompt=None, latents=latents, prompt_embeds=encoder_hidden_states, \
+            latents, encoder_hidden_states, prompts = pipe.prepare_inputs(input_batch, weight_dtype, device)
+            out = pipe(prompt=prompts, latents=latents, prompt_embeds=encoder_hidden_states, \
                 guidance_scale=1.0, num_inference_steps=100)
             _, posterior_results, denoising_results = out.images, out.posterior_results, out.denoising_results
             # [len(attack_timesteps) x [B, 4, 64, 64]]
diff --git a/diffusers/scripts/train_pia.py b/diffusers/scripts/train_pia.py
index c4c69f1..84582a8 100644
--- a/diffusers/scripts/train_pia.py
+++ b/diffusers/scripts/train_pia.py
@@ -35,9 +35,9 @@ def get_reverse_denoise_results(pipe, dataloader, device, normalized, demo):
     scores_sum, scores_all_steps, path_log = [], [], []
     for batch_idx, batch in enumerate(tqdm.tqdm(dataloader)):
         path_log.extend(batch['path'])
-        latents, encoder_hidden_states = pipe.prepare_inputs(batch, weight_dtype, device)
+        latents, encoder_hidden_states, prompts = pipe.prepare_inputs(batch, weight_dtype, device)
         out = pipe(\
-            prompt=None, latents=latents, prompt_embeds=encoder_hidden_states, \
+            prompt=prompts, latents=latents, prompt_embeds=encoder_hidden_states, \
             guidance_scale=1.0, num_inference_steps=100, normalized=normalized, strength=0.5)
         _, posterior_results, denoising_results = out.images, out.posterior_results, out.denoising_results
diff --git a/diffusers/scripts/train_secmi.py b/diffusers/scripts/train_secmi.py
index 3a3a357..7820309 100644
--- a/diffusers/scripts/train_secmi.py
+++ b/diffusers/scripts/train_secmi.py
@@ -9,7 +9,7 @@
 import argparse
 import json,time
 
-from stable_copyright import SecMILatentDiffusionPipeline, SecMIStableDiffusionPipeline, SecMIDDIMScheduler
+from stable_copyright import SecMILatentDiffusionPipeline, SecMIStableDiffusionPipeline, SecMIDDIMScheduler, SecMIStableDiffusionXLPipeline
 from stable_copyright import load_dataset, benchmark, test
 
 
@@ -22,7 +22,9 @@ def load_pipeline(ckpt_path, device='cuda:0', model_type='sd'):
         pipe = SecMILatentDiffusionPipeline.from_pretrained(ckpt_path, torch_dtype=torch.float32)
         # pipe.scheduler = SecMIDDIMScheduler.from_config(pipe.scheduler.config)
     elif model_type == 'sdxl':
-        raise NotImplementedError('SDXL not implemented yet')
+        pipe = SecMIStableDiffusionXLPipeline.from_pretrained(ckpt_path, torch_dtype=torch.float32)
+        pipe.scheduler = SecMIDDIMScheduler.from_config(pipe.scheduler.config)
+        pipe = pipe.to(device)
     else:
         raise NotImplementedError(f'Unrecognized model type {model_type}')
     return pipe
@@ -34,8 +36,8 @@ def get_reverse_denoise_results(pipe, dataloader, device, demo=False):
     scores_50_step, scores_all_steps, path_log = [], [], []
     for batch_idx, batch in enumerate(tqdm.tqdm(dataloader)):
         path_log.extend(batch['path'])
-        latents, encoder_hidden_states = pipe.prepare_inputs(batch, weight_dtype, device)
-        out = pipe(prompt=None, latents=latents, prompt_embeds=encoder_hidden_states, guidance_scale=1.0, num_inference_steps=100)
+        latents, encoder_hidden_states, prompts = pipe.prepare_inputs(batch, weight_dtype, device)
+        out = pipe(prompt=prompts, latents=latents, prompt_embeds=encoder_hidden_states, guidance_scale=1.0, num_inference_steps=100)
         _, posterior_results, denoising_results = out.images, out.posterior_results, out.denoising_results
 
         # print(f'posterior {posterior_results[0].shape}')
diff --git a/diffusers/stable_copyright/__init__.py b/diffusers/stable_copyright/__init__.py
index a187f7f..ec7d314 100644
--- a/diffusers/stable_copyright/__init__.py
+++ b/diffusers/stable_copyright/__init__.py
@@ -3,6 +3,7 @@
 from .secmi_pipeline_stable_diffusion import SecMIStableDiffusionPipeline
 from .secmi_scheduling_ddim import SecMIDDIMScheduler
 from .secmi_pipeline_latent_diffusion import SecMILatentDiffusionPipeline
+from .secmi_pipeline_sdxl import SecMIStableDiffusionXLPipeline
 
 from .pia_pipeline_stable_diffusion import PIAStableDiffusionPipeline
 from .pia_pipeline_latent_diffusion import PIALatentDiffusionPipeline
diff --git a/diffusers/stable_copyright/data_utils.py b/diffusers/stable_copyright/data_utils.py
index 0691117..b61bfa6 100644
--- a/diffusers/stable_copyright/data_utils.py
+++ b/diffusers/stable_copyright/data_utils.py
@@ -138,7 +138,8 @@ def collate_fn(examples):
     else:
         input_ids = torch.stack([example["input_ids"] for example in examples])
     path = [example["path"] for example in examples]
-    return {"pixel_values": pixel_values, "input_ids": input_ids, "path": path}
+    prompts = [example["prompt"] for example in examples]
+    return {"pixel_values": pixel_values, "input_ids": input_ids, "path": path, "prompts": prompts}
pixel_values, "input_ids": input_ids, "path": path, "prompts": prompts} class StandardTransform: def __init__(self, transform: Optional[Callable] = None, target_transform: Optional[Callable] = None) -> None: @@ -234,7 +235,7 @@ def __getitem__(self, index: int): image, input_id = StandardTransform(self.transforms, None)(image, input_id) # return image, target - return {"pixel_values": image, "input_ids": input_id, 'caption': caption, 'path': img_name} + return {"pixel_values": image, "input_ids": input_id, 'prompt': caption, 'path': img_name} def load_dataset(dataset_root, ckpt_path, dataset: str='laion-aesthetic-2-5k', batch_size: int=6, model_type='sd'): diff --git a/diffusers/stable_copyright/gsa_pipeline_latent_diffusion.py b/diffusers/stable_copyright/gsa_pipeline_latent_diffusion.py index 79299f5..2fd68df 100644 --- a/diffusers/stable_copyright/gsa_pipeline_latent_diffusion.py +++ b/diffusers/stable_copyright/gsa_pipeline_latent_diffusion.py @@ -80,7 +80,7 @@ def prepare_inputs(self, batch, weight_dtype, device): for param in self.unet.parameters(): param.requires_grad = True - return latents, encoder_hidden_states + return latents, encoder_hidden_states, None # borrow from Image2Image diff --git a/diffusers/stable_copyright/gsa_pipeline_stable_diffusion.py b/diffusers/stable_copyright/gsa_pipeline_stable_diffusion.py index dece92a..e614040 100644 --- a/diffusers/stable_copyright/gsa_pipeline_stable_diffusion.py +++ b/diffusers/stable_copyright/gsa_pipeline_stable_diffusion.py @@ -33,7 +33,7 @@ def prepare_inputs(self, batch, weight_dtype, device): for param in self.unet.parameters(): param.requires_grad = True - return latents, encoder_hidden_states + return latents, encoder_hidden_states, None # borrow from Image2Image diff --git a/diffusers/stable_copyright/pfami_pipeline_latent_diffusion.py b/diffusers/stable_copyright/pfami_pipeline_latent_diffusion.py index 4bf4bed..41acc8f 100644 --- a/diffusers/stable_copyright/pfami_pipeline_latent_diffusion.py +++ b/diffusers/stable_copyright/pfami_pipeline_latent_diffusion.py @@ -63,7 +63,7 @@ def prepare_inputs(self, batch, weight_dtype, device): latents = self.vae.encode(pixel_values)[0] encoder_hidden_states = None - return latents, encoder_hidden_states + return latents, encoder_hidden_states, None # borrow from Image2Image diff --git a/diffusers/stable_copyright/pfami_pipeline_stable_diffusion.py b/diffusers/stable_copyright/pfami_pipeline_stable_diffusion.py index 63c771e..aaaf366 100644 --- a/diffusers/stable_copyright/pfami_pipeline_stable_diffusion.py +++ b/diffusers/stable_copyright/pfami_pipeline_stable_diffusion.py @@ -30,7 +30,7 @@ def prepare_inputs(self, batch, weight_dtype, device): latents = latents * 0.18215 encoder_hidden_states = self.text_encoder(input_ids)[0] - return latents, encoder_hidden_states + return latents, encoder_hidden_states, None # borrow from Image2Image diff --git a/diffusers/stable_copyright/pia_pipeline_latent_diffusion.py b/diffusers/stable_copyright/pia_pipeline_latent_diffusion.py index 374c46c..f3bc3a8 100644 --- a/diffusers/stable_copyright/pia_pipeline_latent_diffusion.py +++ b/diffusers/stable_copyright/pia_pipeline_latent_diffusion.py @@ -63,7 +63,7 @@ def prepare_inputs(self, batch, weight_dtype, device): latents = self.vae.encode(pixel_values)[0] encoder_hidden_states = None - return latents, encoder_hidden_states + return latents, encoder_hidden_states, None # borrow from Image2Image diff --git a/diffusers/stable_copyright/pia_pipeline_stable_diffusion.py 
index be6f307..b32439c 100644
--- a/diffusers/stable_copyright/pia_pipeline_stable_diffusion.py
+++ b/diffusers/stable_copyright/pia_pipeline_stable_diffusion.py
@@ -30,7 +30,7 @@ def prepare_inputs(self, batch, weight_dtype, device):
         latents = latents * 0.18215
         encoder_hidden_states = self.text_encoder(input_ids)[0]
 
-        return latents, encoder_hidden_states
+        return latents, encoder_hidden_states, None
 
 
 # borrow from Image2Image
diff --git a/diffusers/stable_copyright/secmi_pipeline_latent_diffusion.py b/diffusers/stable_copyright/secmi_pipeline_latent_diffusion.py
index a4717d7..e2efb82 100644
--- a/diffusers/stable_copyright/secmi_pipeline_latent_diffusion.py
+++ b/diffusers/stable_copyright/secmi_pipeline_latent_diffusion.py
@@ -64,7 +64,7 @@ def prepare_inputs(self, batch, weight_dtype, device):
         latents = self.vae.encode(pixel_values)[0]
         encoder_hidden_states = None
 
-        return latents, encoder_hidden_states
+        return latents, encoder_hidden_states, None
 
 
 # borrow from Image2Image
diff --git a/diffusers/stable_copyright/secmi_pipeline_sdxl.py b/diffusers/stable_copyright/secmi_pipeline_sdxl.py
index a14277c..4775b25 100644
--- a/diffusers/stable_copyright/secmi_pipeline_sdxl.py
+++ b/diffusers/stable_copyright/secmi_pipeline_sdxl.py
@@ -21,9 +21,32 @@
 class SecMIStableDiffusionXLPipeline(
     StableDiffusionXLPipeline
 ):
-    @torch.no_grad()
-    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def prepare_inputs(self, batch, weight_dtype, device):
+        pixel_values, prompts = batch["pixel_values"].to(weight_dtype), batch["prompts"]
+        if device == 'cuda':
+            pixel_values = pixel_values.cuda()
+
+        latents = self.vae.encode(pixel_values).latent_dist.sample()
+        latents = latents * self.vae.config.scaling_factor
+
+        return latents, None, prompts
+
+    # borrow from Image2Image
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] # order=1
+        # [601, 581, ..., 21, 1]
+        if hasattr(self.scheduler, "set_begin_index"):
+            self.scheduler.set_begin_index(t_start * self.scheduler.order)
+
+        return timesteps, num_inference_steps - t_start
+
+
+    @torch.no_grad()
     def __call__(
         self,
         prompt: Union[str, List[str]] = None,
@@ -33,7 +56,7 @@
         num_inference_steps: int = 50,
         timesteps: List[int] = None,
         denoising_end: Optional[float] = None,
-        guidance_scale: float = 5.0,
+        guidance_scale: float = 1.0,
         negative_prompt: Optional[Union[str, List[str]]] = None,
         negative_prompt_2: Optional[Union[str, List[str]]] = None,
         num_images_per_prompt: Optional[int] = 1,
@@ -59,148 +82,9 @@
         clip_skip: Optional[int] = None,
         callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
         callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        strength: float=0.2,
         **kwargs,
     ):
-        r"""
-        Function invoked when calling the pipeline for generation.
-
-        Args:
-            prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
-                instead.
-            prompt_2 (`str` or `List[str]`, *optional*):
-                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
-                used in both text-encoders
-            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
-                The height in pixels of the generated image. This is set to 1024 by default for the best results.
-                Anything below 512 pixels won't work well for
-                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
-                and checkpoints that are not specifically fine-tuned on low resolutions.
-            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
-                The width in pixels of the generated image. This is set to 1024 by default for the best results.
-                Anything below 512 pixels won't work well for
-                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
-                and checkpoints that are not specifically fine-tuned on low resolutions.
-            num_inference_steps (`int`, *optional*, defaults to 50):
-                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
-                expense of slower inference.
-            timesteps (`List[int]`, *optional*):
-                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
-                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
-                passed will be used. Must be in descending order.
-            denoising_end (`float`, *optional*):
-                When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
-                completed before it is intentionally prematurely terminated. As a result, the returned sample will
-                still retain a substantial amount of noise as determined by the discrete timesteps selected by the
-                scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a
-                "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
-                Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output)
-            guidance_scale (`float`, *optional*, defaults to 5.0):
-                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                `guidance_scale` is defined as `w` of equation 2. of [Imagen
-                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
-                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
-                usually at the expense of lower image quality.
-            negative_prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. If not defined, one has to pass
-                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
-                less than `1`).
-            negative_prompt_2 (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
-                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
-            num_images_per_prompt (`int`, *optional*, defaults to 1):
-                The number of images to generate per prompt.
-            eta (`float`, *optional*, defaults to 0.0):
-                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
-                [`schedulers.DDIMScheduler`], will be ignored for others.
-            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
-                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
-                to make generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
-                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
-                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
-                tensor will ge generated by sampling using the supplied random `generator`.
-            prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-                argument.
-            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
-                If not provided, pooled text embeddings will be generated from `prompt` input argument.
-            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
-                input argument.
-            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
-            ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
-                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
-                Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
-                if `do_classifier_free_guidance` is set to `True`.
-                If not provided, embeddings are computed from the `ip_adapter_image` input argument.
-            output_type (`str`, *optional*, defaults to `"pil"`):
-                The output format of the generate image. Choose between
-                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
-                of a plain tuple.
-            cross_attention_kwargs (`dict`, *optional*):
-                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
-                `self.processor` in
-                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
-            guidance_rescale (`float`, *optional*, defaults to 0.0):
-                Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
-                Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
-                [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
-                Guidance rescale factor should fix overexposure when using zero terminal SNR.
-            original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
-                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
-                `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
-                explained in section 2.2 of
-                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
-            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
-                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
-                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
-                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
-                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
-            target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
-                For most cases, `target_size` should be set to the desired height and width of the generated image. If
-                not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
-                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
-            negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
-                To negatively condition the generation process based on a specific image resolution. Part of SDXL's
-                micro-conditioning as explained in section 2.2 of
-                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
-                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
-            negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
-                To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's
-                micro-conditioning as explained in section 2.2 of
-                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
-                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
-            negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
-                To negatively condition the generation process based on a target image resolution. It should be as same
-                as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
-                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
-                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
-            callback_on_step_end (`Callable`, *optional*):
-                A function that calls at the end of each denoising steps during the inference. The function is called
-                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
-                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
-                `callback_on_step_end_tensor_inputs`.
-            callback_on_step_end_tensor_inputs (`List`, *optional*):
-                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
-                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
-                `._callback_tensor_inputs` attribute of your pipeline class.
-
-        Examples:
-
-        Returns:
-            [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] or `tuple`:
-            [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
-            `tuple`. When returning a tuple, the first element is a list with the generated images.
-        """
         callback = kwargs.pop("callback", None)
         callback_steps = kwargs.pop("callback_steps", None)
 
@@ -226,6 +110,8 @@
         target_size = target_size or (height, width)
 
         # 1. Check inputs. Raise error if not correct
+        if prompt_embeds is not None and pooled_prompt_embeds is None:
+            prompt_embeds, pooled_prompt_embeds = prompt_embeds
         self.check_inputs(
             prompt,
             prompt_2,
@@ -288,6 +174,7 @@
 
         # 4. Prepare timesteps
         timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
 
         # 5. Prepare latent variables
         num_channels_latents = self.unet.config.in_channels
@@ -330,24 +217,10 @@
         else:
             negative_add_time_ids = add_time_ids
 
-        if self.do_classifier_free_guidance:
-            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
-            add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
-            add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
-
         prompt_embeds = prompt_embeds.to(device)
         add_text_embeds = add_text_embeds.to(device)
         add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)
 
-        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
-            image_embeds = self.prepare_ip_adapter_image_embeds(
-                ip_adapter_image,
-                ip_adapter_image_embeds,
-                device,
-                batch_size * num_images_per_prompt,
-                self.do_classifier_free_guidance,
-            )
-
         # 8. Denoising loop
         num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
 
@@ -375,21 +248,53 @@
                 guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
             ).to(device=device, dtype=latents.dtype)
 
+        # SECMI
+        # get [x_201, x_181, ..., x_1]
+        print(timesteps)
+        posterior_results = []
+        original_latents = latents.detach().clone()
+        for i, t in enumerate(timesteps): # from t_max to t_min
+            noise = randn_tensor(original_latents.shape, generator=generator, device=device, dtype=original_latents.dtype)
+            posterior_latents = self.scheduler.scale_model_input(original_latents, t)
+            posterior_latents = self.scheduler.add_noise(posterior_latents, noise, t)
+            posterior_results.append(posterior_latents.detach().clone())
+            # print(f"{t} timestep posterior: {torch.sum(posterior_latents)}")
+
+        # get [x_(201+20), x_(181+20), ..., x_(1+20)]
+        reverse_results = []
         self._num_timesteps = len(timesteps)
+        for i, t in enumerate(timesteps): # from t_max to t_min
+            latent_model_input = self.scheduler.scale_model_input(posterior_results[i], t)
+            # predict the noise residual
+            added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+            noise_pred = self.unet(
+                latent_model_input,
+                t,
+                encoder_hidden_states=prompt_embeds,
+                timestep_cond=timestep_cond,
+                cross_attention_kwargs=self.cross_attention_kwargs,
+                added_cond_kwargs=added_cond_kwargs,
+                return_dict=False,
+            )[0]
+            # compute the previous noisy sample x_t -> x_t-1
+            reverse_latents = self.scheduler.reverse_step(noise_pred, t, latent_model_input, **extra_step_kwargs, return_dict=False)[0]
+            reverse_results.append(reverse_latents.detach().clone())
+
+        unit_t = timesteps[0] - timesteps[1]
+        denoising_results = []
         with self.progress_bar(total=num_inference_steps) as progress_bar:
             for i, t in enumerate(timesteps):
                 if self.interrupt:
                     continue
 
+                latents = reverse_results[i]
+                t = t + unit_t
                 # expand the latents if we are doing classifier free guidance
-                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
-
+                latent_model_input = latents
                 latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
 
                 # predict the noise residual
                 added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
{"text_embeds": add_text_embeds, "time_ids": add_time_ids} - if ip_adapter_image is not None or ip_adapter_image_embeds is not None: - added_cond_kwargs["image_embeds"] = image_embeds noise_pred = self.unet( latent_model_input, t, @@ -411,6 +316,7 @@ def __call__( # compute the previous noisy sample x_t -> x_t-1 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + denoising_results.append(latents.detach().clone()) if callback_on_step_end is not None: callback_kwargs = {} @@ -467,10 +373,6 @@ def __call__( image = latents if not output_type == "latent": - # apply watermark if available - if self.watermark is not None: - image = self.watermark.apply_watermark(image) - image = self.image_processor.postprocess(image, output_type=output_type) # Offload all models @@ -479,4 +381,4 @@ def __call__( if not return_dict: return (image,) - return SecMIStableDiffusionPipelineOutput(images=image) + return SecMIStableDiffusionPipelineOutput(images=image, denoising_results=denoising_results, posterior_results=posterior_results) diff --git a/diffusers/stable_copyright/secmi_pipeline_stable_diffusion.py b/diffusers/stable_copyright/secmi_pipeline_stable_diffusion.py index 7e0e076..cd9c0fe 100644 --- a/diffusers/stable_copyright/secmi_pipeline_stable_diffusion.py +++ b/diffusers/stable_copyright/secmi_pipeline_stable_diffusion.py @@ -49,7 +49,7 @@ def prepare_inputs(self, batch, weight_dtype, device): latents = latents * 0.18215 encoder_hidden_states = self.text_encoder(input_ids)[0] - return latents, encoder_hidden_states + return latents, encoder_hidden_states, None # borrow from Image2Image