diff --git a/README.md b/README.md
index 5a043e1..b796252 100644
--- a/README.md
+++ b/README.md
@@ -29,10 +29,14 @@ git clone https://github.com/chengzeyi/Comfy-WaveSpeed.git
 
 # Usage
 
-## Demo Workflow
+## Demo Workflows
 
 You can find demo workflows in the `workflows` folder.
 
+[FLUX.1-dev with First Block Cache and Compilation](./workflows/flux.json)
+
+[LTXV with First Block Cache and Compilation](./workflows/ltxv.json)
+
 ## Dynamic Caching ([First Block Cache](https://github.com/chengzeyi/ParaAttention?tab=readme-ov-file#first-block-cache-our-dynamic-caching))
 
 Inspired by TeaCache and other denoising caching algorithms, we introduce [First Block Cache (FBCache)](https://github.com/chengzeyi/ParaAttention?tab=readme-ov-file#first-block-cache-our-dynamic-caching) to use the residual output of the first transformer block as the cache indicator.
@@ -42,7 +46,31 @@ This can significantly reduce the computation cost of the model, achieving a spe
 
 To use first block cache, simply add the `wavespeed->Apply First Block Cache` node to your workflow after your `Load Diffusion Model` node and set `residual_diff_threshold` to a value suitable for your model, for example `0.07` for `flux-dev.safetensors` with `fp8_e4m3fn_fast` and 28 steps. You can expect a speedup of 1.5x to 3.0x with acceptable accuracy loss.
 
-It supports many models like `FLUX`, `LTXV` and `HunyuanVideo (native)`, feel free to try it out and let us know if you have any issues!
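+To give a feel for what this check does, here is a minimal sketch of the idea (illustrative only; `can_use_cache` is a hypothetical helper, not the node's exact implementation):
+
+```python
+import torch
+
+
+def can_use_cache(first_block_residual: torch.Tensor,
+                  cached_residual: torch.Tensor,
+                  residual_diff_threshold: float) -> bool:
+    # Relative mean absolute difference between the current and the cached
+    # first block residual; if it is below the threshold, the remaining
+    # transformer blocks can be skipped and the cached residual reused.
+    diff = (first_block_residual - cached_residual).abs().mean()
+    norm = cached_residual.abs().mean()
+    return (diff / norm).item() < residual_diff_threshold
+```
+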
+Here are some configurations you can try with different models:
+
+| Model | Steps | `residual_diff_threshold` |
+| - | - | - |
+| `flux-dev.safetensors` with `fp8_e4m3fn_fast` | 28 | 0.07 |
+| `ltx-video-2b-v0.9.1.safetensors` | 30 | 0.051 |
+
+It supports many models like `FLUX`, `LTXV (native)` and `HunyuanVideo (native)`. Feel free to try it out and let us know if you run into any issues!
 
 See [Apply First Block Cache on FLUX.1-dev](https://github.com/chengzeyi/ParaAttention/blob/main/doc/fastest_flux.md#apply-first-block-cache-on-flux1-dev) for more information and detailed comparison on quality and speed.
 
diff --git a/fbcache_nodes.py b/fbcache_nodes.py
index c72d4c2..58bf909 100644
--- a/fbcache_nodes.py
+++ b/fbcache_nodes.py
@@ -2,7 +2,6 @@ import unittest
 
 import torch
 
-from . import utils
 from . import first_block_cache
 
 
@@ -56,6 +55,7 @@ def patch(
             residual_diff_threshold=residual_diff_threshold,
             cat_hidden_states_first=diffusion_model.__class__.__name__ == "HunyuanVideo",
             return_hidden_states_only=diffusion_model.__class__.__name__ == "LTXVModel",
+            clone_original_hidden_states=diffusion_model.__class__.__name__ == "LTXVModel",
         )
     ])
     dummy_single_transformer_blocks = torch.nn.ModuleList()
diff --git a/first_block_cache.py b/first_block_cache.py
index b95cef0..b61fa90 100644
--- a/first_block_cache.py
+++ b/first_block_cache.py
@@ -125,6 +125,7 @@ def __init__(
         return_hidden_states_first=True,
         cat_hidden_states_first=False,
         return_hidden_states_only=False,
+        clone_original_hidden_states=False,
     ):
         super().__init__()
         self.transformer_blocks = transformer_blocks
@@ -133,6 +134,7 @@ def __init__(
         self.return_hidden_states_first = return_hidden_states_first
         self.cat_hidden_states_first = cat_hidden_states_first
         self.return_hidden_states_only = return_hidden_states_only
+        self.clone_original_hidden_states = clone_original_hidden_states
 
     def forward(self, img, txt=None, *args, context=None, **kwargs):
         if context is not None:
@@ -165,6 +167,10 @@ def forward(self, img, txt=None, *args, context=None, **kwargs):
                 (encoder_hidden_states, hidden_states))
 
         original_hidden_states = hidden_states
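+        # NOTE: some models (e.g. LTXVModel) appear to modify hidden_states in
+        # place, which would corrupt this saved copy through aliasing.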
+        if self.clone_original_hidden_states:
+            original_hidden_states = original_hidden_states.clone()
         first_transformer_block = self.transformer_blocks[0]
         hidden_states = first_transformer_block(
             hidden_states, encoder_hidden_states, *args, **kwargs)
@@ -214,6 +218,10 @@ def call_remaining_transformer_blocks(self, hidden_states,
                                           **kwargs):
         original_hidden_states = hidden_states
         original_encoder_hidden_states = encoder_hidden_states
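+        # Same aliasing concern as in forward() above.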
+        if self.clone_original_hidden_states:
+            original_hidden_states = original_hidden_states.clone()
+            original_encoder_hidden_states = original_encoder_hidden_states.clone()
         for block in self.transformer_blocks[1:]:
             hidden_states = block(
                 hidden_states, encoder_hidden_states, *args, **kwargs)
diff --git a/pyproject.toml b/pyproject.toml
index 78e37cd..2f96577 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,7 +1,7 @@
 [project]
 name = "wavespeed"
 description = "The all in one inference optimization solution for ComfyUI, universal, flexible, and fast."
-version = "1.0.10"
+version = "1.0.11"
 license = {file = "LICENSE"}
 
 [project.urls]
diff --git a/workflows/ltxv.json b/workflows/ltxv.json
new file mode 100644
index 0000000..094ba82
--- /dev/null
+++ b/workflows/ltxv.json
@@ -0,0 +1,723 @@
+{
+  "last_node_id": 78,
+  "last_link_id": 186,
+  "nodes": [
+    {
+      "id": 6,
+      "type": "CLIPTextEncode",
+      "pos": [
+        420,
+        190
+      ],
+      "size": [
+        422.84503173828125,
+        164.31304931640625
+      ],
+      "flags": {},
+      "order": 6,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "clip",
+          "type": "CLIP",
+          "link": 74
+        }
+      ],
+      "outputs": [
+        {
+          "name": "CONDITIONING",
+          "type": "CONDITIONING",
+          "links": [
+            169
+          ],
+          "slot_index": 0
+        }
+      ],
+      "title": "CLIP Text Encode (Positive Prompt)",
+      "properties": {
+        "Node name for S&R": "CLIPTextEncode"
+      },
+      "widgets_values": [
+        "A woman with long brown hair and light skin smiles at another woman with long blonde hair. The woman with brown hair wears a black jacket and has a small, barely noticeable mole on her right cheek. The camera angle is a close-up, focused on the woman with brown hair's face. The lighting is warm and natural, likely from the setting sun, casting a soft glow on the scene. The scene appears to be real-life footage."
+      ],
+      "color": "#232",
+      "bgcolor": "#353"
+    },
+    {
+      "id": 7,
+      "type": "CLIPTextEncode",
+      "pos": [
+        420,
+        390
+      ],
+      "size": [
+        425.27801513671875,
+        180.6060791015625
+      ],
+      "flags": {},
+      "order": 7,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "clip",
+          "type": "CLIP",
+          "link": 75
+        }
+      ],
+      "outputs": [
+        {
+          "name": "CONDITIONING",
+          "type": "CONDITIONING",
+          "links": [
+            170
+          ],
+          "slot_index": 0
+        }
+      ],
+      "title": "CLIP Text Encode (Negative Prompt)",
+      "properties": {
+        "Node name for S&R": "CLIPTextEncode"
+      },
+      "widgets_values": [
+        "low quality, worst quality, deformed, distorted, disfigured, motion smear, motion artifacts, fused fingers, bad anatomy, weird hand, ugly"
+      ],
+      "color": "#322",
+      "bgcolor": "#533"
+    },
+    {
+      "id": 8,
+      "type": "VAEDecode",
+      "pos": [
+        1600,
+        30
+      ],
+      "size": [
+        210,
+        46
+      ],
+      "flags": {},
+      "order": 12,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "samples",
+          "type": "LATENT",
+          "link": 171
+        },
+        {
+          "name": "vae",
+          "type": "VAE",
+          "link": 87
+        }
+      ],
+      "outputs": [
+        {
+          "name": "IMAGE",
+          "type": "IMAGE",
+          "links": [
+            106
+          ],
+          "slot_index": 0
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "VAEDecode"
+      },
+      "widgets_values": []
+    },
+    {
+      "id": 38,
+      "type": "CLIPLoader",
+      "pos": [
+        60,
+        190
+      ],
+      "size": [
+        315,
+        82
+      ],
+      "flags": {},
+      "order": 3,
+      "mode": 0,
+      "inputs": [],
+      "outputs": [
+        {
+          "name": "CLIP",
+          "type": "CLIP",
+          "links": [
+            74,
+            75
+          ],
+          "slot_index": 0
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "CLIPLoader"
+      },
+      "widgets_values": [
+        "t5xxl_fp8_e4m3fn.safetensors",
+        "ltxv",
+        "default"
+      ]
+    },
+    {
+      "id": 41,
+      "type": "SaveAnimatedWEBP",
+      "pos": [
+        1830,
+        30
+      ],
+      "size": [
+        680,
+        610
+      ],
+      "flags": {},
+      "order": 13,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "images",
+          "type": "IMAGE",
+          "link": 106
+        }
+      ],
+      "outputs": [],
+      "properties": {},
+      "widgets_values": [
+        "ComfyUI",
+        24,
+        false,
+        90,
+        "default",
+        null
+      ]
+    },
+    {
+      "id": 44,
+      "type": "CheckpointLoaderSimple",
+      "pos": [
+        520,
+        30
+      ],
+      "size": [
+        315,
+        98
+      ],
+      "flags": {},
+      "order": 4,
+      "mode": 0,
+      "inputs": [],
+      "outputs": [
+        {
+          "name": "MODEL",
+          "type": "MODEL",
+          "links": [
+            183
+          ],
+          "slot_index": 0
+        },
+        {
+          "name": "CLIP",
+          "type": "CLIP",
+          "links": null
+        },
+        {
+          "name": "VAE",
+          "type": "VAE",
+          "links": [
+            87
+          ],
+          "slot_index": 2
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "CheckpointLoaderSimple"
+      },
+      "widgets_values": [
+        "ltx-video-2b-v0.9.1.safetensors"
+      ]
+    },
+    {
+      "id": 69,
+      "type": "LTXVConditioning",
+      "pos": [
+        920,
+        60
+      ],
+      "size": [
+        223.8660125732422,
+        78
+      ],
+      "flags": {},
+      "order": 9,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "positive",
+          "type": "CONDITIONING",
+          "link": 169
+        },
+        {
+          "name": "negative",
+          "type": "CONDITIONING",
+          "link": 170
+        }
+      ],
+      "outputs": [
+        {
+          "name": "positive",
+          "type": "CONDITIONING",
+          "links": [
+            166
+          ],
+          "slot_index": 0
+        },
+        {
+          "name": "negative",
+          "type": "CONDITIONING",
+          "links": [
+            167
+          ],
+          "slot_index": 1
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "LTXVConditioning"
+      },
+      "widgets_values": [
+        25
+      ]
+    },
+    {
+      "id": 70,
+      "type": "EmptyLTXVLatentVideo",
+      "pos": [
+        860,
+        240
+      ],
+      "size": [
+        315,
+        130
+      ],
+      "flags": {},
+      "order": 0,
+      "mode": 0,
+      "inputs": [],
+      "outputs": [
+        {
+          "name": "LATENT",
+          "type": "LATENT",
+          "links": [
+            168,
+            175
+          ],
+          "slot_index": 0
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "EmptyLTXVLatentVideo"
+      },
+      "widgets_values": [
+        768,
+        512,
+        97,
+        1
+      ]
+    },
+    {
+      "id": 71,
+      "type": "LTXVScheduler",
+      "pos": [
+        856,
+        531
+      ],
+      "size": [
+        315,
+        154
+      ],
+      "flags": {},
+      "order": 5,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "latent",
+          "type": "LATENT",
+          "link": 168,
+          "shape": 7
+        }
+      ],
+      "outputs": [
+        {
+          "name": "SIGMAS",
+          "type": "SIGMAS",
+          "links": [
+            182
+          ],
+          "slot_index": 0
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "LTXVScheduler"
+      },
+      "widgets_values": [
+        30,
+        2.05,
+        0.95,
+        true,
+        0.1
+      ]
+    },
+    {
+      "id": 72,
+      "type": "SamplerCustom",
+      "pos": [
+        1201,
+        32
+      ],
+      "size": [
+        355.20001220703125,
+        230
+      ],
+      "flags": {},
+      "order": 11,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "model",
+          "type": "MODEL",
+          "link": 186
+        },
+        {
+          "name": "positive",
+          "type": "CONDITIONING",
+          "link": 166
+        },
+        {
+          "name": "negative",
+          "type": "CONDITIONING",
+          "link": 167
+        },
+        {
+          "name": "sampler",
+          "type": "SAMPLER",
+          "link": 172
+        },
+        {
+          "name": "sigmas",
+          "type": "SIGMAS",
+          "link": 182
+        },
+        {
+          "name": "latent_image",
+          "type": "LATENT",
+          "link": 175
+        }
+      ],
+      "outputs": [
+        {
+          "name": "output",
+          "type": "LATENT",
+          "links": [
+            171
+          ],
+          "slot_index": 0
+        },
+        {
+          "name": "denoised_output",
+          "type": "LATENT",
+          "links": null
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "SamplerCustom"
+      },
+      "widgets_values": [
+        true,
+        0,
+        "fixed",
+        3
+      ]
+    },
+    {
+      "id": 73,
+      "type": "KSamplerSelect",
+      "pos": [
+        860,
+        420
+      ],
+      "size": [
+        315,
+        58
+      ],
+      "flags": {},
+      "order": 1,
+      "mode": 0,
+      "inputs": [],
+      "outputs": [
+        {
+          "name": "SAMPLER",
+          "type": "SAMPLER",
+          "links": [
+            172
+          ]
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "KSamplerSelect"
+      },
+      "widgets_values": [
+        "euler"
+      ]
+    },
+    {
+      "id": 76,
+      "type": "Note",
+      "pos": [
+        40,
+        350
+      ],
+      "size": [
+        360,
+        200
+      ],
+      "flags": {},
+      "order": 2,
+      "mode": 0,
+      "inputs": [],
+      "outputs": [],
+      "properties": {},
+      "widgets_values": [
+        "This model needs long descriptive prompts, if the prompt is too short the quality will suffer greatly."
+      ],
+      "color": "#432",
+      "bgcolor": "#653"
+    },
+    {
+      "id": 77,
+      "type": "ApplyFBCacheOnModel",
+      "pos": [
+        840,
+        -160
+      ],
+      "size": [
+        315,
+        82
+      ],
+      "flags": {},
+      "order": 8,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "model",
+          "type": "MODEL",
+          "link": 183
+        }
+      ],
+      "outputs": [
+        {
+          "name": "MODEL",
+          "type": "MODEL",
+          "links": [
+            185
+          ],
+          "slot_index": 0
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "ApplyFBCacheOnModel"
+      },
+      "widgets_values": [
+        "diffusion_model",
+        0.051
+      ]
+    },
+    {
+      "id": 78,
+      "type": "EnhancedCompileModel",
+      "pos": [
+        1200,
+        -370
+      ],
+      "size": [
+        400,
+        294
+      ],
+      "flags": {},
+      "order": 10,
+      "mode": 0,
+      "inputs": [
+        {
+          "name": "model",
+          "type": "*",
+          "link": 185
+        }
+      ],
+      "outputs": [
+        {
+          "name": "*",
+          "type": "*",
+          "links": [
+            186
+          ],
+          "slot_index": 0
+        }
+      ],
+      "properties": {
+        "Node name for S&R": "EnhancedCompileModel"
+      },
+      "widgets_values": [
+        true,
+        "diffusion_model",
+        "torch.compile",
+        false,
+        false,
+        "",
+        "",
+        false,
+        "inductor"
+      ]
+    }
+  ],
+  "links": [
+    [
+      74,
+      38,
+      0,
+      6,
+      0,
+      "CLIP"
+    ],
+    [
+      75,
+      38,
+      0,
+      7,
+      0,
+      "CLIP"
+    ],
+    [
+      87,
+      44,
+      2,
+      8,
+      1,
+      "VAE"
+    ],
+    [
+      106,
+      8,
+      0,
+      41,
+      0,
+      "IMAGE"
+    ],
+    [
+      166,
+      69,
+      0,
+      72,
+      1,
+      "CONDITIONING"
+    ],
+    [
+      167,
+      69,
+      1,
+      72,
+      2,
+      "CONDITIONING"
+    ],
+    [
+      168,
+      70,
+      0,
+      71,
+      0,
+      "LATENT"
+    ],
+    [
+      169,
+      6,
+      0,
+      69,
+      0,
+      "CONDITIONING"
+    ],
+    [
+      170,
+      7,
+      0,
+      69,
+      1,
+      "CONDITIONING"
+    ],
+    [
+      171,
+      72,
+      0,
+      8,
+      0,
+      "LATENT"
+    ],
+    [
+      172,
+      73,
+      0,
+      72,
+      3,
+      "SAMPLER"
+    ],
+    [
+      175,
+      70,
+      0,
+      72,
+      5,
+      "LATENT"
+    ],
+    [
+      182,
+      71,
+      0,
+      72,
+      4,
+      "SIGMAS"
+    ],
+    [
+      183,
+      44,
+      0,
+      77,
+      0,
+      "MODEL"
+    ],
+    [
+      185,
+      77,
+      0,
+      78,
+      0,
+      "*"
+    ],
+    [
+      186,
+      78,
+      0,
+      72,
+      0,
+      "MODEL"
+    ]
+  ],
+  "groups": [],
+  "config": {},
+  "extra": {
+    "ds": {
+      "scale": 0.5644739300537776,
+      "offset": {
+        "0": 40.9691162109375,
+        "1": 495.14727783203125
+      }
+    }
+  },
+  "version": 0.4
+}
\ No newline at end of file