From e73a08f58bd12bb39d2d0a8de694311ca254532b Mon Sep 17 00:00:00 2001 From: chengzeyi Date: Mon, 13 Jan 2025 18:42:17 +0800 Subject: [PATCH] support SD3.5 --- README.md | 6 +- first_block_cache.py | 37 ++- pyproject.toml | 2 +- workflows/sd3.5.json | 657 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 685 insertions(+), 17 deletions(-) create mode 100644 workflows/sd3.5.json diff --git a/README.md b/README.md index 3f43f0f..baf2ba2 100644 --- a/README.md +++ b/README.md @@ -39,6 +39,7 @@ You can find demo workflows in the `workflows` folder. | FLUX.1-dev ControlNet with First Block Cache and Compilation | [workflows/flux_controlnet.json](./workflows/flux_controlnet.json) | LTXV with First Block Cache and Compilation | [workflows/ltxv.json](./workflows/ltxv.json) | HunyuanVideo with First Block Cache | [workflows/hunyuan_video.json](./workflows/hunyuan_video.json) +| SD3.5 with First Block Cache and Compilation | [workflows/sd3.5.json](./workflows/sd3.5.json) | SDXL with First Block Cache | [workflows/sdxl.json](./workflows/sdxl.json) **NOTE**: The `Compile Model+` node requires your computation to meet some software and hardware requirements, please refer to the [Enhanced `torch.compile`](#enhanced-torchcompile) section for more information. @@ -54,6 +55,8 @@ This can significantly reduce the computation cost of the model, achieving a spe To use first block cache, simply add the `wavespeed->Apply First Block Cache` node to your workflow after your `Load Diffusion Model` node and adjust the `residual_diff_threashold` value to a suitable value for your model, for example: `0.12` for `flux-dev.safetensors` with `fp8_e4m3fn_fast` and 28 steps. It is expected to see a speedup of 1.5x to 3.0x with acceptable accuracy loss. +It supports many models like `FLUX`, `LTXV (native and non-native)`, `HunyuanVideo (native)`, `SD3.5` and `SDXL`. Feel free to try it out and let us know if you have any issues! 
+ Some configurations for different models that you can try: | Model | Steps | `residual_diff_threashold` | @@ -61,10 +64,9 @@ Some configurations for different models that you can try: | `flux-dev.safetensors` with `fp8_e4m3fn_fast` | 28 | 0.12 | | `ltx-video-2b-v0.9.1.safetensors` | 30 | 0.1 | | `hunyuan_video_720_cfgdistill_fp8_e4m3fn.safetensors` | 20 | 0.1 | +| `sd3.5_large_fp8_scaled.safetensors` | 30 | 0.12 | | `sd_xl_base_1.0.safetensors` | 25 | 0.2 | -It supports many models like `FLUX`, `LTXV (native and non-native)`, `HunyuanVideo (native)` and `SDXL`, feel free to try it out and let us know if you have any issues! - See [Apply First Block Cache on FLUX.1-dev](https://github.com/chengzeyi/ParaAttention/blob/main/doc/fastest_flux.md#apply-first-block-cache-on-flux1-dev) for more information and detailed comparison on quality and speed. ![Usage of First Block Cache](./assets/usage_fbcache.png) diff --git a/first_block_cache.py b/first_block_cache.py index 0d75f57..a9a24e8 100644 --- a/first_block_cache.py +++ b/first_block_cache.py @@ -121,9 +121,11 @@ def apply_prev_hidden_states_residual(hidden_states, encoder_hidden_states_residual = get_buffer( "encoder_hidden_states_residual") - assert encoder_hidden_states_residual is not None, "encoder_hidden_states_residual must be set before" - encoder_hidden_states = encoder_hidden_states_residual + encoder_hidden_states - encoder_hidden_states = encoder_hidden_states.contiguous() + if encoder_hidden_states_residual is None: + encoder_hidden_states = None + else: + encoder_hidden_states = encoder_hidden_states_residual + encoder_hidden_states + encoder_hidden_states = encoder_hidden_states.contiguous() return hidden_states, encoder_hidden_states @@ -294,8 +296,9 @@ def forward(self, *args, **kwargs): txt_arg_name=txt_arg_name, **kwargs) set_buffer("hidden_states_residual", hidden_states_residual) - set_buffer("encoder_hidden_states_residual", - encoder_hidden_states_residual) + if encoder_hidden_states_residual 
is not None: + set_buffer("encoder_hidden_states_residual", + encoder_hidden_states_residual) torch._dynamo.graph_break() if self.return_hidden_states_only: @@ -359,15 +362,19 @@ def call_remaining_transformer_blocks(self, dim=1) hidden_states_shape = hidden_states.shape - encoder_hidden_states_shape = encoder_hidden_states.shape - hidden_states = hidden_states.flatten().contiguous().reshape( hidden_states_shape) - encoder_hidden_states = encoder_hidden_states.flatten().contiguous( - ).reshape(encoder_hidden_states_shape) + + if encoder_hidden_states is not None: + encoder_hidden_states_shape = encoder_hidden_states.shape + encoder_hidden_states = encoder_hidden_states.flatten().contiguous( + ).reshape(encoder_hidden_states_shape) hidden_states_residual = hidden_states - original_hidden_states - encoder_hidden_states_residual = encoder_hidden_states - original_encoder_hidden_states + if encoder_hidden_states is None: + encoder_hidden_states_residual = None + else: + encoder_hidden_states_residual = encoder_hidden_states - original_encoder_hidden_states return hidden_states, encoder_hidden_states, hidden_states_residual, encoder_hidden_states_residual @@ -557,8 +564,8 @@ def create_patch_flux_forward_orig(model, from torch import Tensor from comfy.ldm.flux.model import timestep_embedding - def call_remaining_blocks(self, blocks_replace, control, img, txt, vec, - pe, attn_mask): + def call_remaining_blocks(self, blocks_replace, control, img, txt, vec, pe, + attn_mask): original_hidden_states = img for i, block in enumerate(self.double_blocks): @@ -725,7 +732,8 @@ def block_wrap(args): threshold=residual_diff_threshold, ) if validate_can_use_cache_function is not None: - can_use_cache = validate_can_use_cache_function(can_use_cache) + can_use_cache = validate_can_use_cache_function( + can_use_cache) if not can_use_cache: set_buffer("first_hidden_states_residual", first_hidden_states_residual) @@ -756,7 +764,8 @@ def block_wrap(args): @contextlib.contextmanager def 
patch_forward_orig(): - with unittest.mock.patch.object(model, "forward_orig", new_forward_orig): + with unittest.mock.patch.object(model, "forward_orig", + new_forward_orig): yield return patch_forward_orig diff --git a/pyproject.toml b/pyproject.toml index c1a1114..e8053a7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [project] name = "wavespeed" description = "The all in one inference optimization solution for ComfyUI, universal, flexible, and fast." -version = "1.1.3" +version = "1.1.4" license = {file = "LICENSE"} [project.urls] diff --git a/workflows/sd3.5.json b/workflows/sd3.5.json new file mode 100644 index 0000000..5da8db3 --- /dev/null +++ b/workflows/sd3.5.json @@ -0,0 +1,657 @@ +{ + "last_node_id": 55, + "last_link_id": 104, + "nodes": [ + { + "id": 3, + "type": "KSampler", + "pos": [ + 864, + 96 + ], + "size": [ + 315, + 262 + ], + "flags": {}, + "order": 11, + "mode": 0, + "inputs": [ + { + "name": "model", + "type": "MODEL", + "link": 104, + "slot_index": 0 + }, + { + "name": "positive", + "type": "CONDITIONING", + "link": 21 + }, + { + "name": "negative", + "type": "CONDITIONING", + "link": 80 + }, + { + "name": "latent_image", + "type": "LATENT", + "link": 100 + } + ], + "outputs": [ + { + "name": "LATENT", + "type": "LATENT", + "links": [ + 7 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "KSampler" + }, + "widgets_values": [ + 0, + "fixed", + 30, + 5.45, + "euler", + "sgm_uniform", + 1 + ] + }, + { + "id": 4, + "type": "CheckpointLoaderSimple", + "pos": [ + -96, + 480 + ], + "size": [ + 384.75592041015625, + 98 + ], + "flags": {}, + "order": 6, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "MODEL", + "type": "MODEL", + "links": [ + 101 + ], + "slot_index": 0 + }, + { + "name": "CLIP", + "type": "CLIP", + "links": [], + "slot_index": 1 + }, + { + "name": "VAE", + "type": "VAE", + "links": [ + 53 + ], + "slot_index": 2 + } + ], + "properties": { + "Node name for S&R": 
"CheckpointLoaderSimple" + }, + "widgets_values": [ + "sd3.5_large_fp8_scaled.safetensors" + ] + }, + { + "id": 8, + "type": "VAEDecode", + "pos": [ + 1200, + 96 + ], + "size": [ + 210, + 46 + ], + "flags": {}, + "order": 12, + "mode": 0, + "inputs": [ + { + "name": "samples", + "type": "LATENT", + "link": 7 + }, + { + "name": "vae", + "type": "VAE", + "link": 53, + "slot_index": 1 + } + ], + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 51 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "VAEDecode" + }, + "widgets_values": [] + }, + { + "id": 9, + "type": "SaveImage", + "pos": [ + 1440, + 96 + ], + "size": [ + 952.5112915039062, + 1007.9328002929688 + ], + "flags": {}, + "order": 13, + "mode": 0, + "inputs": [ + { + "name": "images", + "type": "IMAGE", + "link": 51, + "slot_index": 0 + } + ], + "outputs": [], + "properties": {}, + "widgets_values": [ + "ComfyUI" + ] + }, + { + "id": 16, + "type": "CLIPTextEncode", + "pos": [ + 384, + 96 + ], + "size": [ + 432, + 192 + ], + "flags": {}, + "order": 7, + "mode": 0, + "inputs": [ + { + "name": "clip", + "type": "CLIP", + "link": 96 + } + ], + "outputs": [ + { + "name": "CONDITIONING", + "type": "CONDITIONING", + "links": [ + 21 + ], + "slot_index": 0 + } + ], + "title": "Positive Prompt", + "properties": { + "Node name for S&R": "CLIPTextEncode" + }, + "widgets_values": [ + "a bottle with a rainbow galaxy inside it on top of a wooden table on a snowy mountain top with the ocean and clouds in the background" + ], + "color": "#232", + "bgcolor": "#353" + }, + { + "id": 40, + "type": "CLIPTextEncode", + "pos": [ + 384, + 336 + ], + "size": [ + 432, + 192 + ], + "flags": {}, + "order": 8, + "mode": 0, + "inputs": [ + { + "name": "clip", + "type": "CLIP", + "link": 97 + } + ], + "outputs": [ + { + "name": "CONDITIONING", + "type": "CONDITIONING", + "links": [ + 80 + ], + "slot_index": 0, + "shape": 3 + } + ], + "title": "Negative Prompt", + "properties": { + "Node name for 
S&R": "CLIPTextEncode" + }, + "widgets_values": [ + "" + ], + "color": "#322", + "bgcolor": "#533" + }, + { + "id": 41, + "type": "CLIPLoader", + "pos": [ + -96, + 0 + ], + "size": [ + 315, + 82 + ], + "flags": {}, + "order": 0, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "CLIP", + "type": "CLIP", + "links": [], + "slot_index": 0, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "CLIPLoader" + }, + "widgets_values": [ + "t5xxl_fp8_e4m3fn.safetensors", + "sd3", + "default" + ] + }, + { + "id": 42, + "type": "DualCLIPLoader", + "pos": [ + -96, + 144 + ], + "size": [ + 315, + 106 + ], + "flags": {}, + "order": 1, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "CLIP", + "type": "CLIP", + "links": [], + "slot_index": 0, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "DualCLIPLoader" + }, + "widgets_values": [ + "clip_l.safetensors", + "clip_g.safetensors", + "sd3", + "default" + ] + }, + { + "id": 43, + "type": "TripleCLIPLoader", + "pos": [ + -96, + 288 + ], + "size": [ + 315, + 106 + ], + "flags": {}, + "order": 5, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "CLIP", + "type": "CLIP", + "links": [ + 96, + 97 + ], + "slot_index": 0, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "TripleCLIPLoader" + }, + "widgets_values": [ + "clip_l.safetensors", + "clip_g.safetensors", + "t5xxl_fp8_e4m3fn.safetensors" + ] + }, + { + "id": 50, + "type": "Note", + "pos": [ + -384, + 144 + ], + "size": [ + 223.34756469726562, + 254.37765502929688 + ], + "flags": {}, + "order": 2, + "mode": 0, + "inputs": [], + "outputs": [], + "properties": { + "text": "" + }, + "widgets_values": [ + "SD3 supports different text encoder configurations; you can see how to load them here.\n\n\nMake sure to put these files:\nclip_g.safetensors\nclip_l.safetensors\nt5xxl_fp8_e4m3fn.safetensors\n\n\nIn the ComfyUI/models/clip directory" + ], + "color": "#432", + "bgcolor": "#653" + }, + { + "id": 51, + "type": "Note", + "pos": [ + 
-96, + 624 + ], + "size": [ + 384, + 192 + ], + "flags": {}, + "order": 3, + "mode": 0, + "inputs": [], + "outputs": [], + "properties": { + "text": "" + }, + "widgets_values": [ + "sd3.5_large_fp8.safetensors is the file that does not contain any CLIP/text encoder weights so you need to load them separately.\n\nThis file goes in the ComfyUI/models/checkpoints directory." + ], + "color": "#432", + "bgcolor": "#653" + }, + { + "id": 53, + "type": "EmptySD3LatentImage", + "pos": [ + 480, + 576 + ], + "size": [ + 315, + 106 + ], + "flags": {}, + "order": 4, + "mode": 0, + "inputs": [], + "outputs": [ + { + "name": "LATENT", + "type": "LATENT", + "links": [ + 100 + ], + "slot_index": 0, + "shape": 3 + } + ], + "properties": { + "Node name for S&R": "EmptySD3LatentImage" + }, + "widgets_values": [ + 1024, + 1024, + 1 + ] + }, + { + "id": 54, + "type": "ApplyFBCacheOnModel", + "pos": [ + 340, + 750 + ], + "size": [ + 315, + 154 + ], + "flags": {}, + "order": 9, + "mode": 0, + "inputs": [ + { + "name": "model", + "type": "MODEL", + "link": 101 + } + ], + "outputs": [ + { + "name": "MODEL", + "type": "MODEL", + "links": [ + 103 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "ApplyFBCacheOnModel" + }, + "widgets_values": [ + "diffusion_model", + 0.12, + 0, + 1, + -1 + ] + }, + { + "id": 55, + "type": "EnhancedCompileModel", + "pos": [ + 730, + 750 + ], + "size": [ + 400, + 294 + ], + "flags": {}, + "order": 10, + "mode": 0, + "inputs": [ + { + "name": "model", + "type": "*", + "link": 103 + } + ], + "outputs": [ + { + "name": "*", + "type": "*", + "links": [ + 104 + ], + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "EnhancedCompileModel" + }, + "widgets_values": [ + true, + "diffusion_model", + "torch.compile", + false, + false, + "", + "", + false, + "inductor" + ] + } + ], + "links": [ + [ + 7, + 3, + 0, + 8, + 0, + "LATENT" + ], + [ + 21, + 16, + 0, + 3, + 1, + "CONDITIONING" + ], + [ + 51, + 8, + 0, + 9, + 0, + "IMAGE" + ], 
+ [ + 53, + 4, + 2, + 8, + 1, + "VAE" + ], + [ + 80, + 40, + 0, + 3, + 2, + "CONDITIONING" + ], + [ + 96, + 43, + 0, + 16, + 0, + "CLIP" + ], + [ + 97, + 43, + 0, + 40, + 0, + "CLIP" + ], + [ + 100, + 53, + 0, + 3, + 3, + "LATENT" + ], + [ + 101, + 4, + 0, + 54, + 0, + "MODEL" + ], + [ + 103, + 54, + 0, + 55, + 0, + "*" + ], + [ + 104, + 55, + 0, + 3, + 0, + "MODEL" + ] + ], + "groups": [ + { + "id": 1, + "title": "Different Text Encoder Configurations", + "bounding": [ + -140, + -100, + 480, + 528 + ], + "color": "#3f789e", + "font_size": 24, + "flags": {} + } + ], + "config": {}, + "extra": { + "ds": { + "scale": 0.6830134553650711, + "offset": [ + -94.64810292225889, + 94.43701306285806 + ] + } + }, + "version": 0.4 +} \ No newline at end of file