From e73a08f58bd12bb39d2d0a8de694311ca254532b Mon Sep 17 00:00:00 2001
From: chengzeyi <ichengzeyi@gmail.com>
Date: Mon, 13 Jan 2025 18:42:17 +0800
Subject: [PATCH] support SD3.5

---
 README.md            |   6 +-
 first_block_cache.py |  37 ++-
 pyproject.toml       |   2 +-
 workflows/sd3.5.json | 657 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 685 insertions(+), 17 deletions(-)
 create mode 100644 workflows/sd3.5.json

diff --git a/README.md b/README.md
index 3f43f0f..baf2ba2 100644
--- a/README.md
+++ b/README.md
@@ -39,6 +39,7 @@ You can find demo workflows in the `workflows` folder.
 | FLUX.1-dev ControlNet with First Block Cache and Compilation | [workflows/flux_controlnet.json](./workflows/flux_controlnet.json)
 | LTXV with First Block Cache and Compilation | [workflows/ltxv.json](./workflows/ltxv.json)
 | HunyuanVideo with First Block Cache | [workflows/hunyuan_video.json](./workflows/hunyuan_video.json)
+| SD3.5 with First Block Cache and Compilation | [workflows/sd3.5.json](./workflows/sd3.5.json)
 | SDXL with First Block Cache | [workflows/sdxl.json](./workflows/sdxl.json)
 
 **NOTE**: The `Compile Model+` node requires your computation to meet some software and hardware requirements, please refer to the [Enhanced `torch.compile`](#enhanced-torchcompile) section for more information.
@@ -54,6 +55,8 @@ This can significantly reduce the computation cost of the model, achieving a spe
 To use first block cache, simply add the `wavespeed->Apply First Block Cache` node to your workflow after your `Load Diffusion Model` node and adjust the `residual_diff_threashold` value to a suitable value for your model, for example: `0.12` for `flux-dev.safetensors` with `fp8_e4m3fn_fast` and 28 steps.
 It is expected to see a speedup of 1.5x to 3.0x with acceptable accuracy loss.
 
+It supports many models, including `FLUX`, `LTXV (native and non-native)`, `HunyuanVideo (native)`, `SD3.5` and `SDXL`. Feel free to try it out and let us know if you run into any issues!
+
 Some configurations for different models that you can try:
 
 | Model | Steps | `residual_diff_threashold` |
@@ -61,10 +64,9 @@ Some configurations for different models that you can try:
 | `flux-dev.safetensors` with `fp8_e4m3fn_fast` | 28 | 0.12 |
 | `ltx-video-2b-v0.9.1.safetensors` | 30 | 0.1 |
 | `hunyuan_video_720_cfgdistill_fp8_e4m3fn.safetensors` | 20 | 0.1 |
+| `sd3.5_large_fp8_scaled.safetensors` | 30 | 0.12 |
 | `sd_xl_base_1.0.safetensors` | 25 | 0.2 |
 
-It supports many models like `FLUX`, `LTXV (native and non-native)`, `HunyuanVideo (native)` and `SDXL`, feel free to try it out and let us know if you have any issues!
-
 See [Apply First Block Cache on FLUX.1-dev](https://github.com/chengzeyi/ParaAttention/blob/main/doc/fastest_flux.md#apply-first-block-cache-on-flux1-dev) for more information and detailed comparison on quality and speed.
 
 ![Usage of First Block Cache](./assets/usage_fbcache.png)
diff --git a/first_block_cache.py b/first_block_cache.py
index 0d75f57..a9a24e8 100644
--- a/first_block_cache.py
+++ b/first_block_cache.py
@@ -121,9 +121,11 @@ def apply_prev_hidden_states_residual(hidden_states,
 
     encoder_hidden_states_residual = get_buffer(
         "encoder_hidden_states_residual")
-    assert encoder_hidden_states_residual is not None, "encoder_hidden_states_residual must be set before"
-    encoder_hidden_states = encoder_hidden_states_residual + encoder_hidden_states
-    encoder_hidden_states = encoder_hidden_states.contiguous()
+    if encoder_hidden_states_residual is None:
+        encoder_hidden_states = None
+    else:
+        encoder_hidden_states = encoder_hidden_states_residual + encoder_hidden_states
+        encoder_hidden_states = encoder_hidden_states.contiguous()
 
     return hidden_states, encoder_hidden_states
 
@@ -294,8 +296,9 @@ def forward(self, *args, **kwargs):
                 txt_arg_name=txt_arg_name,
                 **kwargs)
             set_buffer("hidden_states_residual", hidden_states_residual)
-            set_buffer("encoder_hidden_states_residual",
-                       encoder_hidden_states_residual)
+            if encoder_hidden_states_residual is not None:
+                set_buffer("encoder_hidden_states_residual",
+                           encoder_hidden_states_residual)
         torch._dynamo.graph_break()
 
         if self.return_hidden_states_only:
@@ -359,15 +362,19 @@ def call_remaining_transformer_blocks(self,
                     dim=1)
 
         hidden_states_shape = hidden_states.shape
-        encoder_hidden_states_shape = encoder_hidden_states.shape
-
         hidden_states = hidden_states.flatten().contiguous().reshape(
             hidden_states_shape)
-        encoder_hidden_states = encoder_hidden_states.flatten().contiguous(
-        ).reshape(encoder_hidden_states_shape)
+
+        if encoder_hidden_states is not None:
+            encoder_hidden_states_shape = encoder_hidden_states.shape
+            encoder_hidden_states = encoder_hidden_states.flatten().contiguous(
+            ).reshape(encoder_hidden_states_shape)
 
         hidden_states_residual = hidden_states - original_hidden_states
-        encoder_hidden_states_residual = encoder_hidden_states - original_encoder_hidden_states
+        if encoder_hidden_states is None:
+            encoder_hidden_states_residual = None
+        else:
+            encoder_hidden_states_residual = encoder_hidden_states - original_encoder_hidden_states
         return hidden_states, encoder_hidden_states, hidden_states_residual, encoder_hidden_states_residual
 
 
@@ -557,8 +564,8 @@ def create_patch_flux_forward_orig(model,
     from torch import Tensor
     from comfy.ldm.flux.model import timestep_embedding
 
-    def call_remaining_blocks(self, blocks_replace, control, img, txt, vec,
-                              pe, attn_mask):
+    def call_remaining_blocks(self, blocks_replace, control, img, txt, vec, pe,
+                              attn_mask):
         original_hidden_states = img
 
         for i, block in enumerate(self.double_blocks):
@@ -725,7 +732,8 @@ def block_wrap(args):
                     threshold=residual_diff_threshold,
                 )
                 if validate_can_use_cache_function is not None:
-                    can_use_cache = validate_can_use_cache_function(can_use_cache)
+                    can_use_cache = validate_can_use_cache_function(
+                        can_use_cache)
                 if not can_use_cache:
                     set_buffer("first_hidden_states_residual",
                                first_hidden_states_residual)
@@ -756,7 +764,8 @@ def block_wrap(args):
 
     @contextlib.contextmanager
     def patch_forward_orig():
-        with unittest.mock.patch.object(model, "forward_orig", new_forward_orig):
+        with unittest.mock.patch.object(model, "forward_orig",
+                                        new_forward_orig):
             yield
 
     return patch_forward_orig
diff --git a/pyproject.toml b/pyproject.toml
index c1a1114..e8053a7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,7 +1,7 @@
 [project]
 name = "wavespeed"
 description = "The all in one inference optimization solution for ComfyUI, universal, flexible, and fast."
-version = "1.1.3"
+version = "1.1.4"
 license = {file = "LICENSE"}
 
 [project.urls]
diff --git a/workflows/sd3.5.json b/workflows/sd3.5.json
new file mode 100644
index 0000000..5da8db3
--- /dev/null
+++ b/workflows/sd3.5.json
@@ -0,0 +1,657 @@
+{
+  "last_node_id": 55,
+  "last_link_id": 104,
+  "nodes": [
+    {
+      "id": 3,
+      "type": "KSampler",
+      "pos": [
+        864,
+        96
+      ],
+      "size": [
+        315,
+        262
+      ],
+      "flags": {},
+      "order": 11,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "model",
+          "type": "MODEL",
+          "link": 104,
+          "slot_index": 0
+        },
+        {
+          "name": "positive",
+          "type": "CONDITIONING",
+          "link": 21
+        },
+        {
+          "name": "negative",
+          "type": "CONDITIONING",
+          "link": 80
+        },
+        {
+          "name": "latent_image",
+          "type": "LATENT",
+          "link": 100
+        }
+      ],
+      "outputs": [
+        {
+          "name": "LATENT",
+          "type": "LATENT",
+          "links": [
+            7
+          ],
+          "slot_index": 0
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "KSampler"
+      },
+      "widgets_values": [
+        0,
+        "fixed",
+        30,
+        5.45,
+        "euler",
+        "sgm_uniform",
+        1
+      ]
+    },
+    {
+      "id": 4,
+      "type": "CheckpointLoaderSimple",
+      "pos": [
+        -96,
+        480
+      ],
+      "size": [
+        384.75592041015625,
+        98
+      ],
+      "flags": {},
+      "order": 6,
+      "mode": 0,
+      "inputs": [],
+      "outputs": [
+        {
+          "name": "MODEL",
+          "type": "MODEL",
+          "links": [
+            101
+          ],
+          "slot_index": 0
+        },
+        {
+          "name": "CLIP",
+          "type": "CLIP",
+          "links": [],
+          "slot_index": 1
+        },
+        {
+          "name": "VAE",
+          "type": "VAE",
+          "links": [
+            53
+          ],
+          "slot_index": 2
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "CheckpointLoaderSimple"
+      },
+      "widgets_values": [
+        "sd3.5_large_fp8_scaled.safetensors"
+      ]
+    },
+    {
+      "id": 8,
+      "type": "VAEDecode",
+      "pos": [
+        1200,
+        96
+      ],
+      "size": [
+        210,
+        46
+      ],
+      "flags": {},
+      "order": 12,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "samples",
+          "type": "LATENT",
+          "link": 7
+        },
+        {
+          "name": "vae",
+          "type": "VAE",
+          "link": 53,
+          "slot_index": 1
+        }
+      ],
+      "outputs": [
+        {
+          "name": "IMAGE",
+          "type": "IMAGE",
+          "links": [
+            51
+          ],
+          "slot_index": 0
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "VAEDecode"
+      },
+      "widgets_values": []
+    },
+    {
+      "id": 9,
+      "type": "SaveImage",
+      "pos": [
+        1440,
+        96
+      ],
+      "size": [
+        952.5112915039062,
+        1007.9328002929688
+      ],
+      "flags": {},
+      "order": 13,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "images",
+          "type": "IMAGE",
+          "link": 51,
+          "slot_index": 0
+        }
+      ],
+      "outputs": [],
+      "properties": {},
+      "widgets_values": [
+        "ComfyUI"
+      ]
+    },
+    {
+      "id": 16,
+      "type": "CLIPTextEncode",
+      "pos": [
+        384,
+        96
+      ],
+      "size": [
+        432,
+        192
+      ],
+      "flags": {},
+      "order": 7,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "clip",
+          "type": "CLIP",
+          "link": 96
+        }
+      ],
+      "outputs": [
+        {
+          "name": "CONDITIONING",
+          "type": "CONDITIONING",
+          "links": [
+            21
+          ],
+          "slot_index": 0
+        }
+      ],
+      "title": "Positive Prompt",
+      "properties": {
+        "Node name for S&R": "CLIPTextEncode"
+      },
+      "widgets_values": [
+        "a bottle with a rainbow galaxy inside it on top of a wooden table on a snowy mountain top with the ocean and clouds in the background"
+      ],
+      "color": "#232",
+      "bgcolor": "#353"
+    },
+    {
+      "id": 40,
+      "type": "CLIPTextEncode",
+      "pos": [
+        384,
+        336
+      ],
+      "size": [
+        432,
+        192
+      ],
+      "flags": {},
+      "order": 8,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "clip",
+          "type": "CLIP",
+          "link": 97
+        }
+      ],
+      "outputs": [
+        {
+          "name": "CONDITIONING",
+          "type": "CONDITIONING",
+          "links": [
+            80
+          ],
+          "slot_index": 0,
+          "shape": 3
+        }
+      ],
+      "title": "Negative Prompt",
+      "properties": {
+        "Node name for S&R": "CLIPTextEncode"
+      },
+      "widgets_values": [
+        ""
+      ],
+      "color": "#322",
+      "bgcolor": "#533"
+    },
+    {
+      "id": 41,
+      "type": "CLIPLoader",
+      "pos": [
+        -96,
+        0
+      ],
+      "size": [
+        315,
+        82
+      ],
+      "flags": {},
+      "order": 0,
+      "mode": 0,
+      "inputs": [],
+      "outputs": [
+        {
+          "name": "CLIP",
+          "type": "CLIP",
+          "links": [],
+          "slot_index": 0,
+          "shape": 3
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "CLIPLoader"
+      },
+      "widgets_values": [
+        "t5xxl_fp8_e4m3fn.safetensors",
+        "sd3",
+        "default"
+      ]
+    },
+    {
+      "id": 42,
+      "type": "DualCLIPLoader",
+      "pos": [
+        -96,
+        144
+      ],
+      "size": [
+        315,
+        106
+      ],
+      "flags": {},
+      "order": 1,
+      "mode": 0,
+      "inputs": [],
+      "outputs": [
+        {
+          "name": "CLIP",
+          "type": "CLIP",
+          "links": [],
+          "slot_index": 0,
+          "shape": 3
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "DualCLIPLoader"
+      },
+      "widgets_values": [
+        "clip_l.safetensors",
+        "clip_g.safetensors",
+        "sd3",
+        "default"
+      ]
+    },
+    {
+      "id": 43,
+      "type": "TripleCLIPLoader",
+      "pos": [
+        -96,
+        288
+      ],
+      "size": [
+        315,
+        106
+      ],
+      "flags": {},
+      "order": 5,
+      "mode": 0,
+      "inputs": [],
+      "outputs": [
+        {
+          "name": "CLIP",
+          "type": "CLIP",
+          "links": [
+            96,
+            97
+          ],
+          "slot_index": 0,
+          "shape": 3
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "TripleCLIPLoader"
+      },
+      "widgets_values": [
+        "clip_l.safetensors",
+        "clip_g.safetensors",
+        "t5xxl_fp8_e4m3fn.safetensors"
+      ]
+    },
+    {
+      "id": 50,
+      "type": "Note",
+      "pos": [
+        -384,
+        144
+      ],
+      "size": [
+        223.34756469726562,
+        254.37765502929688
+      ],
+      "flags": {},
+      "order": 2,
+      "mode": 0,
+      "inputs": [],
+      "outputs": [],
+      "properties": {
+        "text": ""
+      },
+      "widgets_values": [
+        "SD3 supports different text encoder configurations; you can see how to load them here.\n\n\nMake sure to put these files:\nclip_g.safetensors\nclip_l.safetensors\nt5xxl_fp8_e4m3fn.safetensors\n\n\nIn the ComfyUI/models/clip directory"
+      ],
+      "color": "#432",
+      "bgcolor": "#653"
+    },
+    {
+      "id": 51,
+      "type": "Note",
+      "pos": [
+        -96,
+        624
+      ],
+      "size": [
+        384,
+        192
+      ],
+      "flags": {},
+      "order": 3,
+      "mode": 0,
+      "inputs": [],
+      "outputs": [],
+      "properties": {
+        "text": ""
+      },
+      "widgets_values": [
+        "sd3.5_large_fp8.safetensors is the file that does not contain any CLIP/text encoder weights so you need to load them separately.\n\nThis file goes in the ComfyUI/models/checkpoints directory."
+      ],
+      "color": "#432",
+      "bgcolor": "#653"
+    },
+    {
+      "id": 53,
+      "type": "EmptySD3LatentImage",
+      "pos": [
+        480,
+        576
+      ],
+      "size": [
+        315,
+        106
+      ],
+      "flags": {},
+      "order": 4,
+      "mode": 0,
+      "inputs": [],
+      "outputs": [
+        {
+          "name": "LATENT",
+          "type": "LATENT",
+          "links": [
+            100
+          ],
+          "slot_index": 0,
+          "shape": 3
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "EmptySD3LatentImage"
+      },
+      "widgets_values": [
+        1024,
+        1024,
+        1
+      ]
+    },
+    {
+      "id": 54,
+      "type": "ApplyFBCacheOnModel",
+      "pos": [
+        340,
+        750
+      ],
+      "size": [
+        315,
+        154
+      ],
+      "flags": {},
+      "order": 9,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "model",
+          "type": "MODEL",
+          "link": 101
+        }
+      ],
+      "outputs": [
+        {
+          "name": "MODEL",
+          "type": "MODEL",
+          "links": [
+            103
+          ],
+          "slot_index": 0
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "ApplyFBCacheOnModel"
+      },
+      "widgets_values": [
+        "diffusion_model",
+        0.12,
+        0,
+        1,
+        -1
+      ]
+    },
+    {
+      "id": 55,
+      "type": "EnhancedCompileModel",
+      "pos": [
+        730,
+        750
+      ],
+      "size": [
+        400,
+        294
+      ],
+      "flags": {},
+      "order": 10,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "model",
+          "type": "*",
+          "link": 103
+        }
+      ],
+      "outputs": [
+        {
+          "name": "*",
+          "type": "*",
+          "links": [
+            104
+          ],
+          "slot_index": 0
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "EnhancedCompileModel"
+      },
+      "widgets_values": [
+        true,
+        "diffusion_model",
+        "torch.compile",
+        false,
+        false,
+        "",
+        "",
+        false,
+        "inductor"
+      ]
+    }
+  ],
+  "links": [
+    [
+      7,
+      3,
+      0,
+      8,
+      0,
+      "LATENT"
+    ],
+    [
+      21,
+      16,
+      0,
+      3,
+      1,
+      "CONDITIONING"
+    ],
+    [
+      51,
+      8,
+      0,
+      9,
+      0,
+      "IMAGE"
+    ],
+    [
+      53,
+      4,
+      2,
+      8,
+      1,
+      "VAE"
+    ],
+    [
+      80,
+      40,
+      0,
+      3,
+      2,
+      "CONDITIONING"
+    ],
+    [
+      96,
+      43,
+      0,
+      16,
+      0,
+      "CLIP"
+    ],
+    [
+      97,
+      43,
+      0,
+      40,
+      0,
+      "CLIP"
+    ],
+    [
+      100,
+      53,
+      0,
+      3,
+      3,
+      "LATENT"
+    ],
+    [
+      101,
+      4,
+      0,
+      54,
+      0,
+      "MODEL"
+    ],
+    [
+      103,
+      54,
+      0,
+      55,
+      0,
+      "*"
+    ],
+    [
+      104,
+      55,
+      0,
+      3,
+      0,
+      "MODEL"
+    ]
+  ],
+  "groups": [
+    {
+      "id": 1,
+      "title": "Different Text Encoder Configurations",
+      "bounding": [
+        -140,
+        -100,
+        480,
+        528
+      ],
+      "color": "#3f789e",
+      "font_size": 24,
+      "flags": {}
+    }
+  ],
+  "config": {},
+  "extra": {
+    "ds": {
+      "scale": 0.6830134553650711,
+      "offset": [
+        -94.64810292225889,
+        94.43701306285806
+      ]
+    }
+  },
+  "version": 0.4
+}
\ No newline at end of file