diff --git a/diffusers/scripts/exp_sdxl_demo.sh b/diffusers/scripts/exp_sdxl_demo.sh
new file mode 100755
index 0000000..602d30e
--- /dev/null
+++ b/diffusers/scripts/exp_sdxl_demo.sh
@@ -0,0 +1 @@
+python scripts/train_secmi.py --model-type sdxl --ckpt-path ../models/diffusers/Kohaku-XL-Epsilon/ --member-dataset hakubooru-2-5k-member --holdout-dataset hakubooru-2-5k-nonmember --batch-size 3 --demo True
diff --git a/diffusers/scripts/test.py b/diffusers/scripts/test.py
new file mode 100644
index 0000000..2332fcf
--- /dev/null
+++ b/diffusers/scripts/test.py
@@ -0,0 +1,10 @@
+import numpy as np
+from sklearn import preprocessing
+
+member_features = np.array([np.nan, np.inf, -np.inf, 1000, 0, 4, 30000, -899])
+membermax, membermin = np.nanmax(member_features[~np.isposinf(member_features)]), np.nanmin(member_features[~np.isneginf(member_features)])
+member_features = np.nan_to_num(member_features, nan=0, posinf=membermax, neginf=membermin)
+
+x = preprocessing.scale(member_features)
+x = np.nan_to_num(x, nan=0)
+print(x, member_features)
\ No newline at end of file
diff --git a/diffusers/scripts/train_gsa.py b/diffusers/scripts/train_gsa.py
index f879dfe..f2cc43d 100644
--- a/diffusers/scripts/train_gsa.py
+++ b/diffusers/scripts/train_gsa.py
@@ -50,9 +50,9 @@ def get_reverse_denoise_results(pipe, dataloader, device, gsa_mode, demo):
     features, path_log = [], []
     for batch_idx, batch in enumerate(tqdm.tqdm(dataloader)):
         path_log.extend(batch['path'])
-        latents, encoder_hidden_states = pipe.prepare_inputs(batch, weight_dtype, device)
+        latents, encoder_hidden_states, prompts = pipe.prepare_inputs(batch, weight_dtype, device)
         out = pipe(\
-            accelerator=accelerator, optimizer=optimizer, prompt=None, latents=latents, \
+            accelerator=accelerator, optimizer=optimizer, prompt=prompts, latents=latents, \
             prompt_embeds=encoder_hidden_states, guidance_scale=1.0, num_inference_steps=20, gsa_mode=gsa_mode)
         gsa_features = out.gsa_features # # [bsz x Tensor(num_p)]
         # print(f"gsa: {gsa_features}")
diff --git a/diffusers/scripts/train_pfami.py b/diffusers/scripts/train_pfami.py
index 22f55cd..a1aacd1 100644
--- a/diffusers/scripts/train_pfami.py
+++ b/diffusers/scripts/train_pfami.py
@@ -66,8 +66,8 @@ def get_reverse_denoise_results(pipe, dataloader, device, strengths, demo):
         for strength in strengths:
             input_batch = deepcopy(batch)
             input_batch["pixel_values"] = image_perturbation(input_batch["pixel_values"], strength)
-            latents, encoder_hidden_states = pipe.prepare_inputs(input_batch, weight_dtype, device)
-            out = pipe(prompt=None, latents=latents, prompt_embeds=encoder_hidden_states, \
+            latents, encoder_hidden_states, prompts = pipe.prepare_inputs(input_batch, weight_dtype, device)
+            out = pipe(prompt=prompts, latents=latents, prompt_embeds=encoder_hidden_states, \
                 guidance_scale=1.0, num_inference_steps=100)
             _, posterior_results, denoising_results = out.images, out.posterior_results, out.denoising_results
             # [len(attack_timesteps) x [B, 4, 64, 64]]
diff --git a/diffusers/scripts/train_pia.py b/diffusers/scripts/train_pia.py
index c4c69f1..84582a8 100644
--- a/diffusers/scripts/train_pia.py
+++ b/diffusers/scripts/train_pia.py
@@ -35,9 +35,9 @@ def get_reverse_denoise_results(pipe, dataloader, device, normalized, demo):
     scores_sum, scores_all_steps, path_log = [], [], []
     for batch_idx, batch in enumerate(tqdm.tqdm(dataloader)):
         path_log.extend(batch['path'])
-        latents, encoder_hidden_states = pipe.prepare_inputs(batch, weight_dtype, device)
+        latents, encoder_hidden_states, prompts = pipe.prepare_inputs(batch, weight_dtype, device)
         out = pipe(\
-            prompt=None, latents=latents, prompt_embeds=encoder_hidden_states, \
+            prompt=prompts, latents=latents, prompt_embeds=encoder_hidden_states, \
             guidance_scale=1.0, num_inference_steps=100, normalized=normalized, strength=0.5)
         _, posterior_results, denoising_results = out.images, out.posterior_results, out.denoising_results
diff --git a/diffusers/scripts/train_secmi.py b/diffusers/scripts/train_secmi.py
index 3a3a357..7820309 100644
--- a/diffusers/scripts/train_secmi.py
+++ b/diffusers/scripts/train_secmi.py
@@ -9,7 +9,7 @@
 import argparse
 import json,time
 
-from stable_copyright import SecMILatentDiffusionPipeline, SecMIStableDiffusionPipeline, SecMIDDIMScheduler
+from stable_copyright import SecMILatentDiffusionPipeline, SecMIStableDiffusionPipeline, SecMIDDIMScheduler, SecMIStableDiffusionXLPipeline
 from stable_copyright import load_dataset, benchmark, test
 
 
@@ -22,7 +22,9 @@ def load_pipeline(ckpt_path, device='cuda:0', model_type='sd'):
         pipe = SecMILatentDiffusionPipeline.from_pretrained(ckpt_path, torch_dtype=torch.float32)
         # pipe.scheduler = SecMIDDIMScheduler.from_config(pipe.scheduler.config)
     elif model_type == 'sdxl':
-        raise NotImplementedError('SDXL not implemented yet')
+        pipe = SecMIStableDiffusionXLPipeline.from_pretrained(ckpt_path, torch_dtype=torch.float32)
+        pipe.scheduler = SecMIDDIMScheduler.from_config(pipe.scheduler.config)
+        pipe = pipe.to(device)
     else:
         raise NotImplementedError(f'Unrecognized model type {model_type}')
     return pipe
@@ -34,8 +36,8 @@ def get_reverse_denoise_results(pipe, dataloader, device, demo=False):
     scores_50_step, scores_all_steps, path_log = [], [], []
     for batch_idx, batch in enumerate(tqdm.tqdm(dataloader)):
         path_log.extend(batch['path'])
-        latents, encoder_hidden_states = pipe.prepare_inputs(batch, weight_dtype, device)
-        out = pipe(prompt=None, latents=latents, prompt_embeds=encoder_hidden_states, guidance_scale=1.0, num_inference_steps=100)
+        latents, encoder_hidden_states, prompts = pipe.prepare_inputs(batch, weight_dtype, device)
+        out = pipe(prompt=prompts, latents=latents, prompt_embeds=encoder_hidden_states, guidance_scale=1.0, num_inference_steps=100)
         _, posterior_results, denoising_results = out.images, out.posterior_results, out.denoising_results
 
         # print(f'posterior {posterior_results[0].shape}')
diff --git a/diffusers/stable_copyright/__init__.py b/diffusers/stable_copyright/__init__.py
index a187f7f..ec7d314 100644
--- a/diffusers/stable_copyright/__init__.py
+++ b/diffusers/stable_copyright/__init__.py
@@ -3,6 +3,7 @@
 from .secmi_pipeline_stable_diffusion import SecMIStableDiffusionPipeline
 from .secmi_scheduling_ddim import SecMIDDIMScheduler
 from .secmi_pipeline_latent_diffusion import SecMILatentDiffusionPipeline
+from .secmi_pipeline_sdxl import SecMIStableDiffusionXLPipeline
 
 from .pia_pipeline_stable_diffusion import PIAStableDiffusionPipeline
 from .pia_pipeline_latent_diffusion import PIALatentDiffusionPipeline
diff --git a/diffusers/stable_copyright/data_utils.py b/diffusers/stable_copyright/data_utils.py
index 0691117..b61bfa6 100644
--- a/diffusers/stable_copyright/data_utils.py
+++ b/diffusers/stable_copyright/data_utils.py
@@ -138,7 +138,8 @@ def collate_fn(examples):
     else:
         input_ids = torch.stack([example["input_ids"] for example in examples])
     path = [example["path"] for example in examples]
-    return {"pixel_values": pixel_values, "input_ids": input_ids, "path": path}
+    prompts = [example["prompt"] for example in examples]
+    return {"pixel_values": pixel_values, "input_ids": input_ids, "path": path, "prompts": prompts}
pixel_values, "input_ids": input_ids, "path": path, "prompts": prompts} class StandardTransform: def __init__(self, transform: Optional[Callable] = None, target_transform: Optional[Callable] = None) -> None: @@ -234,7 +235,7 @@ def __getitem__(self, index: int): image, input_id = StandardTransform(self.transforms, None)(image, input_id) # return image, target - return {"pixel_values": image, "input_ids": input_id, 'caption': caption, 'path': img_name} + return {"pixel_values": image, "input_ids": input_id, 'prompt': caption, 'path': img_name} def load_dataset(dataset_root, ckpt_path, dataset: str='laion-aesthetic-2-5k', batch_size: int=6, model_type='sd'): diff --git a/diffusers/stable_copyright/gsa_pipeline_latent_diffusion.py b/diffusers/stable_copyright/gsa_pipeline_latent_diffusion.py index 79299f5..2fd68df 100644 --- a/diffusers/stable_copyright/gsa_pipeline_latent_diffusion.py +++ b/diffusers/stable_copyright/gsa_pipeline_latent_diffusion.py @@ -80,7 +80,7 @@ def prepare_inputs(self, batch, weight_dtype, device): for param in self.unet.parameters(): param.requires_grad = True - return latents, encoder_hidden_states + return latents, encoder_hidden_states, None # borrow from Image2Image diff --git a/diffusers/stable_copyright/gsa_pipeline_stable_diffusion.py b/diffusers/stable_copyright/gsa_pipeline_stable_diffusion.py index dece92a..e614040 100644 --- a/diffusers/stable_copyright/gsa_pipeline_stable_diffusion.py +++ b/diffusers/stable_copyright/gsa_pipeline_stable_diffusion.py @@ -33,7 +33,7 @@ def prepare_inputs(self, batch, weight_dtype, device): for param in self.unet.parameters(): param.requires_grad = True - return latents, encoder_hidden_states + return latents, encoder_hidden_states, None # borrow from Image2Image diff --git a/diffusers/stable_copyright/pfami_pipeline_latent_diffusion.py b/diffusers/stable_copyright/pfami_pipeline_latent_diffusion.py index 4bf4bed..41acc8f 100644 --- a/diffusers/stable_copyright/pfami_pipeline_latent_diffusion.py +++ b/diffusers/stable_copyright/pfami_pipeline_latent_diffusion.py @@ -63,7 +63,7 @@ def prepare_inputs(self, batch, weight_dtype, device): latents = self.vae.encode(pixel_values)[0] encoder_hidden_states = None - return latents, encoder_hidden_states + return latents, encoder_hidden_states, None # borrow from Image2Image diff --git a/diffusers/stable_copyright/pfami_pipeline_stable_diffusion.py b/diffusers/stable_copyright/pfami_pipeline_stable_diffusion.py index 63c771e..aaaf366 100644 --- a/diffusers/stable_copyright/pfami_pipeline_stable_diffusion.py +++ b/diffusers/stable_copyright/pfami_pipeline_stable_diffusion.py @@ -30,7 +30,7 @@ def prepare_inputs(self, batch, weight_dtype, device): latents = latents * 0.18215 encoder_hidden_states = self.text_encoder(input_ids)[0] - return latents, encoder_hidden_states + return latents, encoder_hidden_states, None # borrow from Image2Image diff --git a/diffusers/stable_copyright/pia_pipeline_latent_diffusion.py b/diffusers/stable_copyright/pia_pipeline_latent_diffusion.py index 374c46c..f3bc3a8 100644 --- a/diffusers/stable_copyright/pia_pipeline_latent_diffusion.py +++ b/diffusers/stable_copyright/pia_pipeline_latent_diffusion.py @@ -63,7 +63,7 @@ def prepare_inputs(self, batch, weight_dtype, device): latents = self.vae.encode(pixel_values)[0] encoder_hidden_states = None - return latents, encoder_hidden_states + return latents, encoder_hidden_states, None # borrow from Image2Image diff --git a/diffusers/stable_copyright/pia_pipeline_stable_diffusion.py 
index be6f307..b32439c 100644
--- a/diffusers/stable_copyright/pia_pipeline_stable_diffusion.py
+++ b/diffusers/stable_copyright/pia_pipeline_stable_diffusion.py
@@ -30,7 +30,7 @@ def prepare_inputs(self, batch, weight_dtype, device):
         latents = latents * 0.18215
         encoder_hidden_states = self.text_encoder(input_ids)[0]
 
-        return latents, encoder_hidden_states
+        return latents, encoder_hidden_states, None
 
 
 # borrow from Image2Image
diff --git a/diffusers/stable_copyright/secmi_pipeline_latent_diffusion.py b/diffusers/stable_copyright/secmi_pipeline_latent_diffusion.py
index a4717d7..e2efb82 100644
--- a/diffusers/stable_copyright/secmi_pipeline_latent_diffusion.py
+++ b/diffusers/stable_copyright/secmi_pipeline_latent_diffusion.py
@@ -64,7 +64,7 @@ def prepare_inputs(self, batch, weight_dtype, device):
         latents = self.vae.encode(pixel_values)[0]
         encoder_hidden_states = None
 
-        return latents, encoder_hidden_states
+        return latents, encoder_hidden_states, None
 
 
 # borrow from Image2Image
diff --git a/diffusers/stable_copyright/secmi_pipeline_sdxl.py b/diffusers/stable_copyright/secmi_pipeline_sdxl.py
index a14277c..4775b25 100644
--- a/diffusers/stable_copyright/secmi_pipeline_sdxl.py
+++ b/diffusers/stable_copyright/secmi_pipeline_sdxl.py
@@ -21,9 +21,32 @@
 class SecMIStableDiffusionXLPipeline(
     StableDiffusionXLPipeline
 ):
-    @torch.no_grad()
-    @replace_example_docstring(EXAMPLE_DOC_STRING)
+    def prepare_inputs(self, batch, weight_dtype, device):
+        pixel_values, prompts = batch["pixel_values"].to(weight_dtype), batch["prompts"]
+        if device == 'cuda':
+            pixel_values = pixel_values.cuda()
+
+        latents = self.vae.encode(pixel_values).latent_dist.sample()
+        latents = latents * self.vae.config.scaling_factor
+
+        return latents, None, prompts
+
+    # borrow from Image2Image
+    def get_timesteps(self, num_inference_steps, strength, device):
+        # get the original timestep using init_timestep
+        init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
+
+        t_start = max(num_inference_steps - init_timestep, 0)
+        timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] # order=1
+        # [601, 581, ..., 21, 1]
+        if hasattr(self.scheduler, "set_begin_index"):
+            self.scheduler.set_begin_index(t_start * self.scheduler.order)
+
+        return timesteps, num_inference_steps - t_start
+
+
+    @torch.no_grad()
     def __call__(
         self,
         prompt: Union[str, List[str]] = None,
@@ -33,7 +56,7 @@
         num_inference_steps: int = 50,
         timesteps: List[int] = None,
         denoising_end: Optional[float] = None,
-        guidance_scale: float = 5.0,
+        guidance_scale: float = 1.0,
         negative_prompt: Optional[Union[str, List[str]]] = None,
         negative_prompt_2: Optional[Union[str, List[str]]] = None,
         num_images_per_prompt: Optional[int] = 1,
@@ -59,148 +82,9 @@
         clip_skip: Optional[int] = None,
         callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
         callback_on_step_end_tensor_inputs: List[str] = ["latents"],
+        strength: float=0.2,
         **kwargs,
     ):
-        r"""
-        Function invoked when calling the pipeline for generation.
-
-        Args:
-            prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`.
-                instead.
-            prompt_2 (`str` or `List[str]`, *optional*):
-                The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
-                used in both text-encoders
-            height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
-                The height in pixels of the generated image. This is set to 1024 by default for the best results.
-                Anything below 512 pixels won't work well for
-                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
-                and checkpoints that are not specifically fine-tuned on low resolutions.
-            width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
-                The width in pixels of the generated image. This is set to 1024 by default for the best results.
-                Anything below 512 pixels won't work well for
-                [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0)
-                and checkpoints that are not specifically fine-tuned on low resolutions.
-            num_inference_steps (`int`, *optional*, defaults to 50):
-                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
-                expense of slower inference.
-            timesteps (`List[int]`, *optional*):
-                Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument
-                in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is
-                passed will be used. Must be in descending order.
-            denoising_end (`float`, *optional*):
-                When specified, determines the fraction (between 0.0 and 1.0) of the total denoising process to be
-                completed before it is intentionally prematurely terminated. As a result, the returned sample will
-                still retain a substantial amount of noise as determined by the discrete timesteps selected by the
-                scheduler. The denoising_end parameter should ideally be utilized when this pipeline forms a part of a
-                "Mixture of Denoisers" multi-pipeline setup, as elaborated in [**Refining the Image
-                Output**](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/stable_diffusion_xl#refining-the-image-output)
-            guidance_scale (`float`, *optional*, defaults to 5.0):
-                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
-                `guidance_scale` is defined as `w` of equation 2. of [Imagen
-                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
-                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
-                usually at the expense of lower image quality.
-            negative_prompt (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation. If not defined, one has to pass
-                `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is
-                less than `1`).
-            negative_prompt_2 (`str` or `List[str]`, *optional*):
-                The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
-                `text_encoder_2`. If not defined, `negative_prompt` is used in both text-encoders
-            num_images_per_prompt (`int`, *optional*, defaults to 1):
-                The number of images to generate per prompt.
-            eta (`float`, *optional*, defaults to 0.0):
-                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
-                [`schedulers.DDIMScheduler`], will be ignored for others.
-            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
-                One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
-                to make generation deterministic.
-            latents (`torch.FloatTensor`, *optional*):
-                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
-                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
-                tensor will ge generated by sampling using the supplied random `generator`.
-            prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
-                provided, text embeddings will be generated from `prompt` input argument.
-            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
-                argument.
-            pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
-                If not provided, pooled text embeddings will be generated from `prompt` input argument.
-            negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
-                Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
-                weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
-                input argument.
-            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
-            ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
-                Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters.
-                Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding
-                if `do_classifier_free_guidance` is set to `True`.
-                If not provided, embeddings are computed from the `ip_adapter_image` input argument.
-            output_type (`str`, *optional*, defaults to `"pil"`):
-                The output format of the generate image. Choose between
-                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
-            return_dict (`bool`, *optional*, defaults to `True`):
-                Whether or not to return a [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] instead
-                of a plain tuple.
-            cross_attention_kwargs (`dict`, *optional*):
-                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
-                `self.processor` in
-                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
-            guidance_rescale (`float`, *optional*, defaults to 0.0):
-                Guidance rescale factor proposed by [Common Diffusion Noise Schedules and Sample Steps are
-                Flawed](https://arxiv.org/pdf/2305.08891.pdf) `guidance_scale` is defined as `φ` in equation 16. of
-                [Common Diffusion Noise Schedules and Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf).
-                Guidance rescale factor should fix overexposure when using zero terminal SNR.
-            original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
-                If `original_size` is not the same as `target_size` the image will appear to be down- or upsampled.
-                `original_size` defaults to `(height, width)` if not specified. Part of SDXL's micro-conditioning as
-                explained in section 2.2 of
-                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
-            crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
-                `crops_coords_top_left` can be used to generate an image that appears to be "cropped" from the position
-                `crops_coords_top_left` downwards. Favorable, well-centered images are usually achieved by setting
-                `crops_coords_top_left` to (0, 0). Part of SDXL's micro-conditioning as explained in section 2.2 of
-                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
-            target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
-                For most cases, `target_size` should be set to the desired height and width of the generated image. If
-                not specified it will default to `(height, width)`. Part of SDXL's micro-conditioning as explained in
-                section 2.2 of [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952).
-            negative_original_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
-                To negatively condition the generation process based on a specific image resolution. Part of SDXL's
-                micro-conditioning as explained in section 2.2 of
-                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
-                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
-            negative_crops_coords_top_left (`Tuple[int]`, *optional*, defaults to (0, 0)):
-                To negatively condition the generation process based on a specific crop coordinates. Part of SDXL's
-                micro-conditioning as explained in section 2.2 of
-                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
-                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
-            negative_target_size (`Tuple[int]`, *optional*, defaults to (1024, 1024)):
-                To negatively condition the generation process based on a target image resolution. It should be as same
-                as the `target_size` for most cases. Part of SDXL's micro-conditioning as explained in section 2.2 of
-                [https://huggingface.co/papers/2307.01952](https://huggingface.co/papers/2307.01952). For more
-                information, refer to this issue thread: https://github.com/huggingface/diffusers/issues/4208.
-            callback_on_step_end (`Callable`, *optional*):
-                A function that calls at the end of each denoising steps during the inference. The function is called
-                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
-                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
-                `callback_on_step_end_tensor_inputs`.
-            callback_on_step_end_tensor_inputs (`List`, *optional*):
-                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
-                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
-                `._callback_tensor_inputs` attribute of your pipeline class.
-
-        Examples:
-
-        Returns:
-            [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] or `tuple`:
-            [`~pipelines.stable_diffusion_xl.StableDiffusionXLPipelineOutput`] if `return_dict` is True, otherwise a
-            `tuple`. When returning a tuple, the first element is a list with the generated images.
-        """
         callback = kwargs.pop("callback", None)
         callback_steps = kwargs.pop("callback_steps", None)
 
@@ -226,6 +110,8 @@
         target_size = target_size or (height, width)
 
         # 1. Check inputs. Raise error if not correct
+        if prompt_embeds is not None and pooled_prompt_embeds is None:
+            prompt_embeds, pooled_prompt_embeds = prompt_embeds
         self.check_inputs(
             prompt,
             prompt_2,
@@ -288,6 +174,7 @@
 
         # 4. Prepare timesteps
         timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps)
+        timesteps, num_inference_steps = self.get_timesteps(num_inference_steps, strength, device)
 
         # 5. Prepare latent variables
         num_channels_latents = self.unet.config.in_channels
@@ -330,24 +217,10 @@
         else:
             negative_add_time_ids = add_time_ids
 
-        if self.do_classifier_free_guidance:
-            prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
-            add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
-            add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
-
         prompt_embeds = prompt_embeds.to(device)
         add_text_embeds = add_text_embeds.to(device)
         add_time_ids = add_time_ids.to(device).repeat(batch_size * num_images_per_prompt, 1)
 
-        if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
-            image_embeds = self.prepare_ip_adapter_image_embeds(
-                ip_adapter_image,
-                ip_adapter_image_embeds,
-                device,
-                batch_size * num_images_per_prompt,
-                self.do_classifier_free_guidance,
-            )
-
         # 8. Denoising loop
         num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
 
@@ -375,21 +248,53 @@
                 guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim
             ).to(device=device, dtype=latents.dtype)
 
+        # SECMI
+        # get [x_201, x_181, ..., x_1]
+        print(timesteps)
+        posterior_results = []
+        original_latents = latents.detach().clone()
+        for i, t in enumerate(timesteps): # from t_max to t_min
+            noise = randn_tensor(original_latents.shape, generator=generator, device=device, dtype=original_latents.dtype)
+            posterior_latents = self.scheduler.scale_model_input(original_latents, t)
+            posterior_latents = self.scheduler.add_noise(posterior_latents, noise, t)
+            posterior_results.append(posterior_latents.detach().clone())
+            # print(f"{t} timestep posterior: {torch.sum(posterior_latents)}")
+
+        # get [x_(201+20), x_(181+20), ..., x_(1+20)]
+        reverse_results = []
         self._num_timesteps = len(timesteps)
+        for i, t in enumerate(timesteps): # from t_max to t_min
+            latent_model_input = self.scheduler.scale_model_input(posterior_results[i], t)
+            # predict the noise residual
+            added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
+            noise_pred = self.unet(
+                latent_model_input,
+                t,
+                encoder_hidden_states=prompt_embeds,
+                timestep_cond=timestep_cond,
+                cross_attention_kwargs=self.cross_attention_kwargs,
+                added_cond_kwargs=added_cond_kwargs,
+                return_dict=False,
+            )[0]
+            # compute the previous noisy sample x_t -> x_t-1
+            reverse_latents = self.scheduler.reverse_step(noise_pred, t, latent_model_input, **extra_step_kwargs, return_dict=False)[0]
+            reverse_results.append(reverse_latents.detach().clone())
+
+        unit_t = timesteps[0] - timesteps[1]
+        denoising_results = []
         with self.progress_bar(total=num_inference_steps) as progress_bar:
             for i, t in enumerate(timesteps):
                 if self.interrupt:
                     continue
 
+                latents = reverse_results[i]
+                t = t + unit_t
                 # expand the latents if we are doing classifier free guidance
-                latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents
-
+                latent_model_input = latents
                 latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
 
                 # predict the noise residual
                 added_cond_kwargs = {"text_embeds": add_text_embeds, "time_ids": add_time_ids}
{"text_embeds": add_text_embeds, "time_ids": add_time_ids} - if ip_adapter_image is not None or ip_adapter_image_embeds is not None: - added_cond_kwargs["image_embeds"] = image_embeds noise_pred = self.unet( latent_model_input, t, @@ -411,6 +316,7 @@ def __call__( # compute the previous noisy sample x_t -> x_t-1 latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + denoising_results.append(latents.detach().clone()) if callback_on_step_end is not None: callback_kwargs = {} @@ -467,10 +373,6 @@ def __call__( image = latents if not output_type == "latent": - # apply watermark if available - if self.watermark is not None: - image = self.watermark.apply_watermark(image) - image = self.image_processor.postprocess(image, output_type=output_type) # Offload all models @@ -479,4 +381,4 @@ def __call__( if not return_dict: return (image,) - return SecMIStableDiffusionPipelineOutput(images=image) + return SecMIStableDiffusionPipelineOutput(images=image, denoising_results=denoising_results, posterior_results=posterior_results) diff --git a/diffusers/stable_copyright/secmi_pipeline_stable_diffusion.py b/diffusers/stable_copyright/secmi_pipeline_stable_diffusion.py index 7e0e076..cd9c0fe 100644 --- a/diffusers/stable_copyright/secmi_pipeline_stable_diffusion.py +++ b/diffusers/stable_copyright/secmi_pipeline_stable_diffusion.py @@ -49,7 +49,7 @@ def prepare_inputs(self, batch, weight_dtype, device): latents = latents * 0.18215 encoder_hidden_states = self.text_encoder(input_ids)[0] - return latents, encoder_hidden_states + return latents, encoder_hidden_states, None # borrow from Image2Image