diff --git a/README.md b/README.md index be32b50..eb2d651 100644 --- a/README.md +++ b/README.md @@ -458,26 +458,26 @@ $ gguf-parser --hf-repo="gpustack/FLUX.1-dev-GGUF" --hf-file="FLUX.1-dev-FP16.gg +-------+------+-----------+--------------+---------------+-----------+------------+-----------+ | TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | +-------+------+-----------+--------------+---------------+-----------+------------+-----------+ -| model | N/A | diffusion | F16 | true | 31.95 GiB | 17 B | 16.14 bpw | +| model | N/A | diffusion | F16 | true | 31.79 GiB | 17 B | 16.06 bpw | +-------+------+-----------+--------------+---------------+-----------+------------+-----------+ -+-----------------------------------------------------------------------------------------+ -| ARCHITECTURE | -+----------------+-------------------------------------------------+----------------------+ -| DIFFUSION ARCH | CONDITIONERS | AUTOENCODER | -+----------------+-------------------------------------------------+----------------------+ -| FLUX.1-dev | OpenAI CLIP ViT-L/14 (F16), Google T5-xxl (F16) | FLUX.1-dev VAE (F32) | -+----------------+-------------------------------------------------+----------------------+ - -+-----------------------------------------------------------------------------------------------------------------------------+ -| ESTIMATE | -+------------+-----------------+-------------+---------------+----------------+-----------------------+-----------------------+ -| ARCH | FLASH ATTENTION | MMAP LOAD | DISTRIBUTABLE | FULL OFFLOADED | RAM | VRAM 0 | -| | | | | +----------+------------+-----------+-----------+ -| | | | | | UMA | NONUMA | UMA | NONUMA | -+------------+-----------------+-------------+---------------+----------------+----------+------------+-----------+-----------+ -| flux_1_dev | Disabled | Unsupported | Unsupported | No | 5.13 MiB | 155.13 MiB | 31.95 GiB | 32.19 GiB | 
-+------------+-----------------+-------------+---------------+----------------+----------+------------+-----------+-----------+ ++-------------------------------------------------------------------------------------+ +| ARCHITECTURE | ++----------------+-------------------------------------------------+------------------+ +| DIFFUSION ARCH | CONDITIONERS | AUTOENCODER | ++----------------+-------------------------------------------------+------------------+ +| FLUX.1 | OpenAI CLIP ViT-L/14 (F16), Google T5-xxl (F16) | FLUX.1 VAE (F16) | ++----------------+-------------------------------------------------+------------------+ + ++--------------------------------------------------------------------------------------------------------------------------+ +| ESTIMATE | ++--------+-----------------+-------------+---------------+----------------+------------------------+-----------------------+ +| ARCH | FLASH ATTENTION | MMAP LOAD | DISTRIBUTABLE | FULL OFFLOADED | RAM | VRAM 0 | +| | | | | +-----------+------------+-----------+-----------+ +| | | | | | UMA | NONUMA | UMA | NONUMA | ++--------+-----------------+-------------+---------------+----------------+-----------+------------+-----------+-----------+ +| flux_1 | Disabled | Unsupported | Unsupported | Yes | 87.45 MiB | 237.45 MiB | 31.79 GiB | 41.06 GiB | ++--------+-----------------+-------------+---------------+----------------+-----------+------------+-----------+-----------+ $ # Parse FLUX.1-dev Model without offload Conditioner and Autoencoder $ gguf-parser --hf-repo="gpustack/FLUX.1-dev-GGUF" --hf-file="FLUX.1-dev-FP16.gguf" --clip-on-cpu --vae-on-cpu @@ -486,26 +486,53 @@ $ gguf-parser --hf-repo="gpustack/FLUX.1-dev-GGUF" --hf-file="FLUX.1-dev-FP16.gg +-------+------+-----------+--------------+---------------+-----------+------------+-----------+ | TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | BPW | 
+-------+------+-----------+--------------+---------------+-----------+------------+-----------+ -| model | N/A | diffusion | F16 | true | 31.95 GiB | 17 B | 16.14 bpw | +| model | N/A | diffusion | F16 | true | 31.79 GiB | 17 B | 16.06 bpw | ++-------+------+-----------+--------------+---------------+-----------+------------+-----------+ + ++-------------------------------------------------------------------------------------+ +| ARCHITECTURE | ++----------------+-------------------------------------------------+------------------+ +| DIFFUSION ARCH | CONDITIONERS | AUTOENCODER | ++----------------+-------------------------------------------------+------------------+ +| FLUX.1 | OpenAI CLIP ViT-L/14 (F16), Google T5-xxl (F16) | FLUX.1 VAE (F16) | ++----------------+-------------------------------------------------+------------------+ + ++-------------------------------------------------------------------------------------------------------------------------+ +| ESTIMATE | ++--------+-----------------+-------------+---------------+----------------+-----------------------+-----------------------+ +| ARCH | FLASH ATTENTION | MMAP LOAD | DISTRIBUTABLE | FULL OFFLOADED | RAM | VRAM 0 | +| | | | | +-----------+-----------+-----------+-----------+ +| | | | | | UMA | NONUMA | UMA | NONUMA | ++--------+-----------------+-------------+---------------+----------------+-----------+-----------+-----------+-----------+ +| flux_1 | Disabled | Unsupported | Unsupported | Yes | 16.09 GiB | 16.24 GiB | 22.29 GiB | 25.05 GiB | ++--------+-----------------+-------------+---------------+----------------+-----------+-----------+-----------+-----------+ + +$ # Parse FLUX.1-dev Model with Autoencoder tiling ++----------------------------------------------------------------------------------------------+ +| METADATA | ++-------+------+-----------+--------------+---------------+-----------+------------+-----------+ +| TYPE | NAME | ARCH | QUANTIZATION | LITTLE ENDIAN | SIZE | PARAMETERS | 
BPW | ++-------+------+-----------+--------------+---------------+-----------+------------+-----------+ +| model | N/A | diffusion | F16 | true | 31.79 GiB | 17 B | 16.06 bpw | +-------+------+-----------+--------------+---------------+-----------+------------+-----------+ -+-----------------------------------------------------------------------------------------+ -| ARCHITECTURE | -+----------------+-------------------------------------------------+----------------------+ -| DIFFUSION ARCH | CONDITIONERS | AUTOENCODER | -+----------------+-------------------------------------------------+----------------------+ -| FLUX.1-dev | OpenAI CLIP ViT-L/14 (F16), Google T5-xxl (F16) | FLUX.1-dev VAE (F32) | -+----------------+-------------------------------------------------+----------------------+ - -+---------------------------------------------------------------------------------------------------------------------------+ -| ESTIMATE | -+------------+-----------------+-------------+---------------+----------------+---------------------+-----------------------+ -| ARCH | FLASH ATTENTION | MMAP LOAD | DISTRIBUTABLE | FULL OFFLOADED | RAM | VRAM 0 | -| | | | | +----------+----------+-----------+-----------+ -| | | | | | UMA | NONUMA | UMA | NONUMA | -+------------+-----------------+-------------+---------------+----------------+----------+----------+-----------+-----------+ -| flux_1_dev | Disabled | Unsupported | Unsupported | No | 9.66 GiB | 9.81 GiB | 22.29 GiB | 22.54 GiB | -+------------+-----------------+-------------+---------------+----------------+----------+----------+-----------+-----------+ ++-------------------------------------------------------------------------------------+ +| ARCHITECTURE | ++----------------+-------------------------------------------------+------------------+ +| DIFFUSION ARCH | CONDITIONERS | AUTOENCODER | ++----------------+-------------------------------------------------+------------------+ +| FLUX.1 | OpenAI CLIP ViT-L/14 (F16), 
Google T5-xxl (F16) | FLUX.1 VAE (F16) | ++----------------+-------------------------------------------------+------------------+ + ++--------------------------------------------------------------------------------------------------------------------------+ +| ESTIMATE | ++--------+-----------------+-------------+---------------+----------------+------------------------+-----------------------+ +| ARCH | FLASH ATTENTION | MMAP LOAD | DISTRIBUTABLE | FULL OFFLOADED | RAM | VRAM 0 | +| | | | | +-----------+------------+-----------+-----------+ +| | | | | | UMA | NONUMA | UMA | NONUMA | ++--------+-----------------+-------------+---------------+----------------+-----------+------------+-----------+-----------+ +| flux_1 | Disabled | Unsupported | Unsupported | Yes | 87.45 MiB | 237.45 MiB | 31.79 GiB | 36.18 GiB | ++--------+-----------------+-------------+---------------+----------------+-----------+------------+-----------+-----------+ ``` diff --git a/cmd/gguf-parser/README.md b/cmd/gguf-parser/README.md index d31f50b..f8a8507 100644 --- a/cmd/gguf-parser/README.md +++ b/cmd/gguf-parser/README.md @@ -27,13 +27,13 @@ GLOBAL OPTIONS: Estimate - --device-metric value [ --device-metric value ] Specify the device metrics, which is used to estimate the throughput, in form of "FLOPS;Up Bandwidth[;Down Bandwidth]". The FLOPS unit, select from [PFLOPS, TFLOPS, GFLOPS, MFLOPS, KFLOPS]. The Up/Down Bandwidth unit, select from [PiBps, TiBps, GiBps, MiBps, KiBps, PBps, TBps, GBps, MBps, KBps, Pbps, Tbps, Gbps, Mbps, Kbps]. Up Bandwidth usually indicates the bandwidth to transmit the data to calculate, and Down Bandwidth indicates the bandwidth to transmit the calculated result to next layer. For example, "--device-metric 10TFLOPS;400GBps" means the device has 10 TFLOPS and 400 GBps Up/Down bandwidth, "--device-metric 10TFLOPS;400GBps;5000MBps" means the device has 5000MBps Down bandwidth. 
If the quantity specified by "--device-metric" is less than the number of estimation devices(determined by "--tensor-split" and "--rpc" to infer the device count), then replicate the last "--device-metric" to meet the required number of evaluation devices. - --flash-attention, --flash-attn, --fa Specify enabling Flash Attention, which is used to estimate the usage. Flash Attention can reduce the usage of RAM/VRAM. (default: false) - --main-gpu value, --mg value Specify the GPU to use for the model (with "--split-mode=none") or for intermediate results and KV (with "--split-mode=row"), which is used to estimate the usage. Since gguf-parser cannot recognize the host GPU devices or RPC servers, "--main-gpu" only works when "--tensor-split" is set. (default: 0) - --parallel-size value, --parallel value, --np value Specify the number of parallel sequences to decode, which is used to estimate the usage. (default: 1) - --platform-footprint value Specify the platform footprint(RAM,VRAM) of running host in MiB, which is used to estimate the NonUMA usage, default is "150,250". Different platform always gets different RAM and VRAM footprints, for example, within CUDA, "cudaMemGetInfo" would occupy some RAM and VRAM, see https://stackoverflow.com/questions/64854862/free-memory-occupied-by-cudamemgetinfo. (default: "150,250") - --rpc value Specify the RPC servers, which is used to estimate the usage, it is a comma-separated list of host:port. Woks with "--tensor-split". - --tensor-split value, --ts value Specify the fraction of the model to offload to each device, which is used to estimate the usage, it is a comma-separated list of integer. Since gguf-parser cannot recognize the host GPU devices or RPC servers, must explicitly set "--tensor-split" to indicate how many devices are used. To declare the devices belong to RPC servers, set "--rpc" please. 
+ --device-metric value [ --device-metric value ] Specify the device metrics, which is used to estimate the throughput, in form of "FLOPS;Up Bandwidth[;Down Bandwidth]". The FLOPS unit, select from [PFLOPS, TFLOPS, GFLOPS, MFLOPS, KFLOPS]. The Up/Down Bandwidth unit, select from [PiBps, TiBps, GiBps, MiBps, KiBps, PBps, TBps, GBps, MBps, KBps, Pbps, Tbps, Gbps, Mbps, Kbps]. Up Bandwidth usually indicates the bandwidth to transmit the data to calculate, and Down Bandwidth indicates the bandwidth to transmit the calculated result to next layer. For example, "--device-metric 10TFLOPS;400GBps" means the device has 10 TFLOPS and 400 GBps Up/Down bandwidth, "--device-metric 10TFLOPS;400GBps;5000MBps" means the device has 5000MBps Down bandwidth. If the quantity specified by "--device-metric" is less than the number of estimation devices(determined by "--tensor-split" and "--rpc" to infer the device count), then replicate the last "--device-metric" to meet the required number of evaluation devices. + --flash-attention, --flash-attn, --fa, --diffusion-fa Specify enabling Flash Attention, which is used to estimate the usage. Flash Attention can reduce the usage of RAM/VRAM. (default: false) + --main-gpu value, --mg value Specify the GPU to use for the model (with "--split-mode=none") or for intermediate results and KV (with "--split-mode=row"), which is used to estimate the usage. Since gguf-parser cannot recognize the host GPU devices or RPC servers, "--main-gpu" only works when "--tensor-split" is set. (default: 0) + --parallel-size value, --parallel value, --np value Specify the number of parallel sequences to decode, which is used to estimate the usage. (default: 1) + --platform-footprint value Specify the platform footprint(RAM,VRAM) of running host in MiB, which is used to estimate the NonUMA usage, default is "150,250". 
Different platform always gets different RAM and VRAM footprints, for example, within CUDA, "cudaMemGetInfo" would occupy some RAM and VRAM, see https://stackoverflow.com/questions/64854862/free-memory-occupied-by-cudamemgetinfo. (default: "150,250") + --rpc value Specify the RPC servers, which is used to estimate the usage, it is a comma-separated list of host:port. Works with "--tensor-split". + --tensor-split value, --ts value Specify the fraction of the model to offload to each device, which is used to estimate the usage, it is a comma-separated list of integer. Since gguf-parser cannot recognize the host GPU devices or RPC servers, must explicitly set "--tensor-split" to indicate how many devices are used. To declare the devices belong to RPC servers, set "--rpc" please. Estimate/LLaMACpp @@ -54,10 +54,12 @@ GLOBAL OPTIONS: --image-autoencoder-tiling, --vae-tiling, --image-vae-tiling Specify to enable tiling for the vae model. (default: false) --image-batch-count value, --batch-count value, --image-max-batch value Specify the batch(generation) count of the image. (default: 1) - --image-height value, --height value, --image-max-height value Specify the (maximum) height of the image. (default: 512) + --image-free-compute-memory-immediately Specify to free the compute memory immediately after the generation, which bursts VRAM usage. (default: false) + --image-height value, --height value, --image-max-height value Specify the (maximum) height of the image. (default: 1024) --image-no-autoencoder-offload, --vae-on-cpu, --image-no-vae-model-offload Specify to offload the vae model to CPU. (default: false) + --image-no-autoencoder-tiling, --image-no-vae-tiling Specify to disable tiling for the vae model, it takes precedence over --image-autoencoder-tiling. (default: false) + --image-no-conditioner-offload, --clip-on-cpu, --image-no-text-encoder-model-offload Specify to offload the text encoder model to CPU.
(default: false) - --image-width value, --width value, --image-max-width value Specify the (maximum) width of the image. (default: 512) + --image-width value, --width value, --image-max-width value Specify the (maximum) width of the image. (default: 1024) Load diff --git a/cmd/gguf-parser/main.go b/cmd/gguf-parser/main.go index 7865230..0ba854e 100644 --- a/cmd/gguf-parser/main.go +++ b/cmd/gguf-parser/main.go @@ -469,7 +469,11 @@ func main() { Value: flashAttention, Category: "Estimate", Name: "flash-attention", - Aliases: []string{"flash-attn", "fa"}, + Aliases: []string{ + "flash-attn", + "fa", + "diffusion-fa", // StableDiffusionCpp compatibility + }, Usage: "Specify enabling Flash Attention, " + "which is used to estimate the usage. " + "Flash Attention can reduce the usage of RAM/VRAM.", @@ -715,6 +719,23 @@ func main() { }, Usage: "Specify to enable tiling for the vae model.", }, + &cli.BoolFlag{ + Destination: &sdcNoAutoencoderTiling, + Value: sdcNoAutoencoderTiling, + Category: "Estimate/StableDiffusionCpp", + Name: "image-no-autoencoder-tiling", + Aliases: []string{ + "image-no-vae-tiling", // LLaMABox compatibility + }, + Usage: "Specify to disable tiling for the vae model, it takes precedence over --image-autoencoder-tiling.", + }, + &cli.BoolFlag{ + Destination: &sdcFreeComputeMemoryImmediately, + Value: sdcFreeComputeMemoryImmediately, + Category: "Estimate/StableDiffusionCpp", + Name: "image-free-compute-memory-immediately", // LLaMABox compatibility + Usage: "Specify to free the compute memory immediately after the generation, which burst using VRAM.", + }, &cli.BoolFlag{ Destination: &raw, Value: raw, @@ -872,12 +893,14 @@ var ( lmcOffloadLayersDraft = -1 lmcOffloadLayersStep uint64 // estimate options for stable-diffusion.cpp - sdcBatchCount = 1 - sdcHeight = 512 - sdcWidth = 512 - sdcNoConditionerOffload bool - sdcNoAutoencoderOffload bool - sdcAutoencoderTiling bool + sdcBatchCount = 1 + sdcHeight = 1024 + sdcWidth = 1024 + 
sdcNoConditionerOffload bool + sdcNoAutoencoderOffload bool + sdcAutoencoderTiling bool + sdcNoAutoencoderTiling bool + sdcFreeComputeMemoryImmediately bool // output options raw bool rawOutput string @@ -1049,9 +1072,12 @@ func mainAction(c *cli.Context) error { if sdcNoAutoencoderOffload { eopts = append(eopts, WithoutStableDiffusionCppOffloadAutoencoder()) } - if sdcAutoencoderTiling { + if sdcAutoencoderTiling && !sdcNoAutoencoderTiling { eopts = append(eopts, WithStableDiffusionCppAutoencoderTiling()) } + if sdcFreeComputeMemoryImmediately { + eopts = append(eopts, WithStableDiffusionCppFreeComputeMemoryImmediately()) + } // Parse GGUF file. diff --git a/file_architecture.go b/file_architecture.go index cd43118..0f84c6c 100644 --- a/file_architecture.go +++ b/file_architecture.go @@ -247,16 +247,9 @@ func (gf *GGUFFile) diffuserArchitecture() (ga GGUFArchitecture) { sdKey = "model.diffusion_model.output_blocks.11.1.transformer_blocks.0.attn2.to_v.weight" // SD 1.x/2.x sdXlKey = "model.diffusion_model.output_blocks.5.1.transformer_blocks.1.attn1.to_v.weight" // SD XL sdXlRefinerKey = "model.diffusion_model.output_blocks.8.1.transformer_blocks.1.attn1.to_v.weight" // SD XL Refiner - sd3MediumKey = "model.diffusion_model.joint_blocks.23.x_block.attn.proj.weight" // SD 3 Medium - sd3_5MediumKey = "model.diffusion_model.joint_blocks.23.x_block.attn.ln_k.weight" // SD 3.5 Medium - sd3_5LargeKey = "model.diffusion_model.joint_blocks.37.x_block.attn.ln_k.weight" // SD 3.5 Large - - fluxKey = "model.diffusion_model.double_blocks.7.txt_attn.proj.weight" - fluxKey2 = "double_blocks.7.txt_attn.proj.weight" - fluxDevAndLiteKey = "model.diffusion_model.guidance_in.in_layer.weight" // FLUX.1-dev / FLUX.1-lite - fluxDevAndLiteKey2 = "guidance_in.in_layer.weight" - fluxDevAndSchnellKey = "model.diffusion_model.double_blocks.18.txt_attn.proj.weight" // FLUX.1-dev / FLUX.1-schnell - fluxDevAndSchnellKey2 = "double_blocks.18.txt_attn.proj.weight" + sd3Key = 
"model.diffusion_model.joint_blocks.23.x_block.attn.proj.weight" // SD 3.x + fluxKey = "model.diffusion_model.double_blocks.0.txt_attn.proj.weight" // FLUX.1 + fluxKey2 = "double_blocks.0.txt_attn.proj.weight" // Conditioner @@ -271,15 +264,9 @@ func (gf *GGUFFile) diffuserArchitecture() (ga GGUFArchitecture) { sdKey, sdXlKey, sdXlRefinerKey, - sd3MediumKey, - sd3_5MediumKey, - sd3_5LargeKey, + sd3Key, fluxKey, fluxKey2, - fluxDevAndLiteKey, - fluxDevAndLiteKey2, - fluxDevAndSchnellKey, - fluxDevAndSchnellKey2, openAiClipVitL14Key, openClipVitH14Key, @@ -301,37 +288,15 @@ func (gf *GGUFFile) diffuserArchitecture() (ga GGUFArchitecture) { if _, ok = tis[sdXlRefinerKey]; ok { ga.DiffusionArchitecture = "Stable Diffusion XL Refiner" } - } else if _, ok := tis[sd3MediumKey]; ok { - ga.DiffusionArchitecture = "Stable Diffusion 3 Medium" - if _, ok = tis[sd3_5MediumKey]; ok { - ga.DiffusionArchitecture = "Stable Diffusion 3.5 Medium" - if _, ok = tis[sd3_5LargeKey]; ok { - ga.DiffusionArchitecture = "Stable Diffusion 3.5 Large" - } - } + } else if _, ok := tis[sd3Key]; ok { + ga.DiffusionArchitecture = "Stable Diffusion 3.x" ga.DiffusionTransformer = true } if _, ok := tis[fluxKey]; ok { - if _, ok = tis[fluxDevAndLiteKey]; ok { - if _, ok = tis[fluxDevAndSchnellKey]; ok { - ga.DiffusionArchitecture = "FLUX.1-dev" - } else { - ga.DiffusionArchitecture = "FLUX.1-lite" - } - } else { - ga.DiffusionArchitecture = "FLUX.1-schnell" - } + ga.DiffusionArchitecture = "FLUX.1" ga.DiffusionTransformer = true } else if _, ok := tis[fluxKey2]; ok { - if _, ok = tis[fluxDevAndLiteKey2]; ok { - if _, ok = tis[fluxDevAndSchnellKey2]; ok { - ga.DiffusionArchitecture = "FLUX.1-dev" - } else { - ga.DiffusionArchitecture = "FLUX.1-lite" - } - } else { - ga.DiffusionArchitecture = "FLUX.1-schnell" - } + ga.DiffusionArchitecture = "FLUX.1" ga.DiffusionTransformer = true } diff --git a/file_estimate__stablediffusioncpp.go b/file_estimate__stablediffusioncpp.go index b453015..441045c 100644 
--- a/file_estimate__stablediffusioncpp.go +++ b/file_estimate__stablediffusioncpp.go @@ -2,10 +2,12 @@ package gguf_parser import ( "regexp" + "strings" + + "golang.org/x/exp/maps" "github.com/gpustack/gguf-parser-go/util/ptr" "github.com/gpustack/gguf-parser-go/util/stringx" - "strings" ) // Types for StableDiffusionCpp estimation. @@ -91,10 +93,10 @@ func (gf *GGUFFile) EstimateStableDiffusionCppRun(opts ...GGUFRunEstimateOption) o.SDCBatchCount = ptr.To[int32](1) } if o.SDCHeight == nil { - o.SDCHeight = ptr.To[uint32](512) + o.SDCHeight = ptr.To[uint32](1024) } if o.SDCWidth == nil { - o.SDCWidth = ptr.To[uint32](512) + o.SDCWidth = ptr.To[uint32](1024) } if o.SDCOffloadConditioner == nil { o.SDCOffloadConditioner = ptr.To(true) @@ -105,6 +107,9 @@ func (gf *GGUFFile) EstimateStableDiffusionCppRun(opts ...GGUFRunEstimateOption) if o.SDCAutoencoderTiling == nil { o.SDCAutoencoderTiling = ptr.To(false) } + if o.SDCFreeComputeMemoryImmediately == nil { + o.SDCFreeComputeMemoryImmediately = ptr.To(false) + } // Devices. e.Devices = make([]StableDiffusionCppRunDeviceUsage, len(o.TensorSplitFraction)+1) @@ -166,13 +171,9 @@ func (gf *GGUFFile) EstimateStableDiffusionCppRun(opts ...GGUFRunEstimateOption) { // Bootstrap. e.Devices[0].Footprint = GGUFBytesScalar(10*1024*1024) /* model load */ + (gf.Size - gf.ModelSize) /* metadata */ - - // Output buffer, - // see - // TODO: Implement this. 
} - var cdLs, aeLs, mdLs GGUFLayerTensorInfos + var cdLs, aeLs, dmLs GGUFLayerTensorInfos { var tis GGUFTensorInfos tis = gf.TensorInfos.Search(regexp.MustCompile(`^cond_stage_model\..*`)) @@ -188,13 +189,13 @@ func (gf *GGUFFile) EstimateStableDiffusionCppRun(opts ...GGUFRunEstimateOption) } tis = gf.TensorInfos.Search(regexp.MustCompile(`^model\.diffusion_model\..*`)) if len(tis) != 0 { - mdLs = tis.Layers() + dmLs = tis.Layers() } else { - mdLs = gf.TensorInfos.Layers() + dmLs = gf.TensorInfos.Layers() } } - var cdDevIdx, aeDevIdx, mdDevIdx int + var cdDevIdx, aeDevIdx, dmDevIdx int { if *o.SDCOffloadConditioner { cdDevIdx = 1 @@ -202,7 +203,7 @@ func (gf *GGUFFile) EstimateStableDiffusionCppRun(opts ...GGUFRunEstimateOption) if *o.SDCOffloadAutoencoder { aeDevIdx = 1 } - mdDevIdx = 1 + dmDevIdx = 1 } // Weight & Parameter. @@ -220,10 +221,8 @@ func (gf *GGUFFile) EstimateStableDiffusionCppRun(opts ...GGUFRunEstimateOption) } // Model. - if mdLs != nil { - e.Devices[mdDevIdx].Weight = GGUFBytesScalar(mdLs.Bytes()) - e.Devices[mdDevIdx].Parameter = GGUFParametersScalar(mdLs.Elements()) - } + e.Devices[dmDevIdx].Weight = GGUFBytesScalar(dmLs.Bytes()) + e.Devices[dmDevIdx].Parameter = GGUFParametersScalar(dmLs.Elements()) } // Computation. 
@@ -237,71 +236,118 @@ func (gf *GGUFFile) EstimateStableDiffusionCppRun(opts ...GGUFRunEstimateOption) // Work context, // see https://github.com/leejet/stable-diffusion.cpp/blob/4570715727f35e5a07a76796d823824c8f42206c/stable-diffusion.cpp#L1467-L1481, // https://github.com/leejet/stable-diffusion.cpp/blob/4570715727f35e5a07a76796d823824c8f42206c/stable-diffusion.cpp#L1572-L1586, - // https://github.com/leejet/stable-diffusion.cpp/blob/4570715727f35e5a07a76796d823824c8f42206c/stable-diffusion.cpp#L1675-L1679, - // https://github.com/thxCode/stable-diffusion.cpp/blob/78629d6340f763a8fe14372e0ba3ace73526a265/stable-diffusion.cpp#L2185-L2189, - // https://github.com/thxCode/stable-diffusion.cpp/blob/78629d6340f763a8fe14372e0ba3ace73526a265/stable-diffusion.cpp#L2270-L2274. + // https://github.com/leejet/stable-diffusion.cpp/blob/4570715727f35e5a07a76796d823824c8f42206c/stable-diffusion.cpp#L1675-L1679. // { - var wcSize uint32 = 50 * 1024 * 1024 - wcSize += *o.SDCWidth * *o.SDCHeight * 3 * 4 /* sizeof(float) */ * 2 // RGB - e.Devices[0].Computation += GGUFBytesScalar(wcSize * uint32(ptr.Deref(o.ParallelSize, 1))) + usage := uint64(50 * 1024 * 1024) + usage += uint64(*o.SDCWidth) * uint64(*o.SDCHeight) * 3 /* output channels */ * 4 /* sizeof(float) */ * 2 /* include img2img*/ + e.Devices[0].Computation += GGUFBytesScalar(usage * uint64(ptr.Deref(o.ParallelSize, 1)) /* max batch */) } - // Conditioner learned conditions, + // Encode usage, // see https://github.com/leejet/stable-diffusion.cpp/blob/4570715727f35e5a07a76796d823824c8f42206c/conditioner.hpp#L388-L391, // https://github.com/leejet/stable-diffusion.cpp/blob/4570715727f35e5a07a76796d823824c8f42206c/conditioner.hpp#L758-L766, // https://github.com/leejet/stable-diffusion.cpp/blob/4570715727f35e5a07a76796d823824c8f42206c/conditioner.hpp#L1083-L1085. 
- switch { - case strings.HasPrefix(a.DiffusionArchitecture, "FLUX"): - for i := range cdLs { - ds := []uint64{1} - switch i { - case 0: - ds = []uint64{768, 77} - case 1: - ds = []uint64{4096, 256} + { + var tes [][]uint64 + switch { + case strings.HasPrefix(a.DiffusionArchitecture, "FLUX"): // FLUX.1 + tes = [][]uint64{ + {768, 77}, + {4096, 256}, } - cds := GGUFBytesScalar(GGMLTypeF32.RowSizeOf(ds)) * 2 // include unconditioner - e.Conditioners[i].Devices[cdDevIdx].Computation += cds - } - case strings.HasPrefix(a.DiffusionArchitecture, "Stable Diffusion 3"): - for i := range cdLs { - ds := []uint64{1} - switch i { - case 0: - ds = []uint64{768, 77} - case 1: - ds = []uint64{1280, 77} - case 2: - ds = []uint64{4096, 77} + case strings.HasPrefix(a.DiffusionArchitecture, "Stable Diffusion 3"): // SD 3.x + tes = [][]uint64{ + {768, 77}, + {1280, 77}, + {4096, 77}, } - cds := GGUFBytesScalar(GGMLTypeF32.RowSizeOf(ds)) * 2 // include unconditioner - e.Conditioners[i].Devices[cdDevIdx].Computation += cds - } - default: - for i := range cdLs { - ds := []uint64{1} - switch i { - case 0: - ds = []uint64{768, 77} - if strings.HasSuffix(a.DiffusionArchitecture, "Refiner") { - ds = []uint64{1280, 77} + case strings.HasPrefix(a.DiffusionArchitecture, "Stable Diffusion XL"): // SD XL/XL Refiner + if strings.HasSuffix(a.DiffusionArchitecture, "Refiner") { + tes = [][]uint64{ + {1280, 77}, + } + } else { + tes = [][]uint64{ + {768, 77}, + {1280, 77}, } - case 1: - ds = []uint64{1280, 77} } - cds := GGUFBytesScalar(GGMLTypeF32.RowSizeOf(ds)) * 2 // include unconditioner - e.Conditioners[i].Devices[cdDevIdx].Computation += cds + default: // SD 1.x/2.x + tes = [][]uint64{ + {768, 77}, + } + } + for i := range cdLs { + usage := GGMLTypeF32.RowSizeOf(tes[i]) * 2 /* include conditioner */ + e.Conditioners[i].Devices[cdDevIdx].Computation += GGUFBytesScalar(usage) } + + // TODO VAE Encode } - // Diffusion nosier, - // see 
https://github.com/leejet/stable-diffusion.cpp/blob/4570715727f35e5a07a76796d823824c8f42206c/stable-diffusion.cpp#L1361. - { - mds := GGUFBytesScalar(GGMLTypeF32.RowSizeOf([]uint64{uint64(*o.SDCWidth / 8), uint64(*o.SDCHeight / 8), 16, 1})) - e.Devices[mdDevIdx].Computation += mds + // Diffusing usage. + if !*o.SDCFreeComputeMemoryImmediately { + var usage uint64 + switch { + case strings.HasPrefix(a.DiffusionArchitecture, "FLUX"): // FLUX.1 + usage = GuessFLUXDiffusionModelMemoryUsage(*o.SDCWidth, *o.SDCHeight, e.FlashAttention) + case strings.HasPrefix(a.DiffusionArchitecture, "Stable Diffusion 3"): // SD 3.x + const ( + sd3MediumKey = "model.diffusion_model.joint_blocks.23.x_block.attn.proj.weight" // SD 3 Medium + sd35MediumKey = "model.diffusion_model.joint_blocks.23.x_block.attn.ln_k.weight" // SD 3.5 Medium + sd35LargeKey = "model.diffusion_model.joint_blocks.37.x_block.attn.ln_k.weight" // SD 3.5 Large + ) + m, _ := dmLs.Index([]string{sd3MediumKey, sd35MediumKey, sd35LargeKey}) + switch { + case m[sd35LargeKey].Name != "": + usage = GuessSD35LargeDiffusionModelMemoryUsage(*o.SDCWidth, *o.SDCHeight, e.FlashAttention) + case m[sd35MediumKey].Name != "": + usage = GuessSD35MediumDiffusionModelMemoryUsage(*o.SDCWidth, *o.SDCHeight, e.FlashAttention) + default: + usage = GuessSD3MediumDiffusionModelMemoryUsage(*o.SDCWidth, *o.SDCHeight, e.FlashAttention) + } + case strings.HasPrefix(a.DiffusionArchitecture, "Stable Diffusion XL"): // SD XL/XL Refiner + const ( + sdXlKey = "model.diffusion_model.output_blocks.5.1.transformer_blocks.1.attn1.to_v.weight" // SD XL + sdXlRefinerKey = "model.diffusion_model.output_blocks.8.1.transformer_blocks.1.attn1.to_v.weight" // SD XL Refiner + ) + m, _ := dmLs.Index([]string{sdXlKey, sdXlRefinerKey}) + if m[sdXlRefinerKey].Name != "" { + usage = GuessSDXLRefinerDiffusionModelMemoryUsage(*o.SDCWidth, *o.SDCHeight, e.FlashAttention) + } else { + usage = GuessSDXLDiffusionModelMemoryUsage(*o.SDCWidth, *o.SDCHeight, 
e.FlashAttention) + } + case strings.HasPrefix(a.DiffusionArchitecture, "Stable Diffusion 2"): // SD 2.x + usage = GuessSD2DiffusionModelMemoryUsage(*o.SDCWidth, *o.SDCHeight, e.FlashAttention) + default: // SD 1.x + usage = GuessSD1DiffusionModelMemoryUsage(*o.SDCWidth, *o.SDCHeight, e.FlashAttention) + } + e.Devices[dmDevIdx].Computation += GGUFBytesScalar(usage) } + // Decode usage. + if aeLs != nil && !*o.SDCFreeComputeMemoryImmediately { + var convDim uint64 + { + m, _ := aeLs.Index([]string{ + "first_stage_model.decoder.conv_in.weight", + "decoder.conv_in.weight", + }) + tis := maps.Values(m) + if len(tis) != 0 && tis[0].NDimensions > 3 { + convDim = max(tis[0].Dimensions[0], tis[0].Dimensions[3]) + } + } + + var usage uint64 + if !*o.SDCAutoencoderTiling { + usage = uint64(*o.SDCWidth) * uint64(*o.SDCHeight) * (3 /* output channels */ *4 /* sizeof(float) */ + 1) * convDim + } else { + usage = 512 * 512 * (3 /* output channels */ *4 /* sizeof(float) */ + 1) * convDim + } + e.Autoencoder.Devices[aeDevIdx].Computation += GGUFBytesScalar(usage) + } } return e @@ -396,10 +442,10 @@ func (e StableDiffusionCppRunEstimate) SummarizeItem( cp := d.Computation // UMA. - emi.VRAMs[i].UMA = fp + wg + cp + emi.VRAMs[i].UMA = fp + wg + /* cp */ 0 // NonUMA. 
- emi.VRAMs[i].NonUMA = GGUFBytesScalar(nonUMAVramFootprint) + emi.VRAMs[i].UMA + emi.VRAMs[i].NonUMA = GGUFBytesScalar(nonUMAVramFootprint) + fp + wg + cp } } diff --git a/file_estimate_option.go b/file_estimate_option.go index 7afffc2..047e74b 100644 --- a/file_estimate_option.go +++ b/file_estimate_option.go @@ -31,14 +31,15 @@ type ( LMCAdapters []LLaMACppRunEstimate // StableDiffusionCpp (SDC) specific - SDCBatchCount *int32 - SDCHeight *uint32 - SDCWidth *uint32 - SDCOffloadConditioner *bool - SDCOffloadAutoencoder *bool - SDCAutoencoderTiling *bool - SDCUpscaler *StableDiffusionCppRunEstimate - SDCControlNet *StableDiffusionCppRunEstimate + SDCBatchCount *int32 + SDCHeight *uint32 + SDCWidth *uint32 + SDCOffloadConditioner *bool + SDCOffloadAutoencoder *bool + SDCAutoencoderTiling *bool + SDCFreeComputeMemoryImmediately *bool + SDCUpscaler *StableDiffusionCppRunEstimate + SDCControlNet *StableDiffusionCppRunEstimate } // GGUFRunDeviceMetric holds the device metric for the estimate. @@ -318,6 +319,13 @@ func WithStableDiffusionCppAutoencoderTiling() GGUFRunEstimateOption { } } +// WithStableDiffusionCppFreeComputeMemoryImmediately enables freeing compute memory immediately. +func WithStableDiffusionCppFreeComputeMemoryImmediately() GGUFRunEstimateOption { + return func(o *_GGUFRunEstimateOptions) { + o.SDCFreeComputeMemoryImmediately = ptr.To(true) + } +} + // WithStableDiffusionCppUpscaler sets the upscaler estimate usage. func WithStableDiffusionCppUpscaler(ups *StableDiffusionCppRunEstimate) GGUFRunEstimateOption { return func(o *_GGUFRunEstimateOptions) { diff --git a/file_metadata.go b/file_metadata.go index 19d8695..528ff98 100644 --- a/file_metadata.go +++ b/file_metadata.go @@ -315,9 +315,13 @@ func GetFileType(cm map[GGMLType]int) GGUFFileType { }) // Guess. 
+ if ts[0] == GGMLTypeF32 { + if len(ts) == 1 { + return GGUFFileTypeAllF32 + } + ts[0] = ts[1] + } switch ts[0] { - case GGMLTypeF32: - return GGUFFileTypeAllF32 case GGMLTypeF16: return GGUFFileTypeMostlyF16 case GGMLTypeQ4_0: diff --git a/gen.go b/gen.go index 6dfd072..6f8ff36 100644 --- a/gen.go +++ b/gen.go @@ -1,2 +1,3 @@ -//go:generate go generate -tags tools gen.stringer.go +//go:generate go generate -tags stringer gen.stringer.go +//go:generate go generate -tags regression gen.regression.go package gguf_parser diff --git a/gen.regression.go b/gen.regression.go new file mode 100644 index 0000000..483d713 --- /dev/null +++ b/gen.regression.go @@ -0,0 +1,491 @@ +//go:build regression + +//go:generate go run -tags regression gen.regression.go +package main + +import ( + "fmt" + "strconv" + "math" + "os" + "text/template" + "bytes" + "go/format" + + "gonum.org/v1/gonum/mat" +) + +type LinearRegression struct { + Intercept float64 + Slope float64 +} + +func (lr *LinearRegression) Fit(xs, ys []float64) { + if len(xs) != len(ys) { + panic("length of xs and ys must be the same") + } + + var sX, sY, sXY, sXX float64 + for i := 0; i < len(xs); i++ { + sX += xs[i] + sY += ys[i] + sXY += xs[i] * ys[i] + sXX += xs[i] * xs[i] + } + + n := float64(len(xs)) + d := n*sXX - sX*sX + if d == 0 { + d = 1 + } + + lr.Slope = (n*sXY - sX*sY) / d + lr.Intercept = (sY*sXX - sX*sXY) / d +} + +func (lr *LinearRegression) Predict(x float64) (y float64) { + return lr.Intercept + lr.Slope*x +} + +type PolynomialRegression struct { + Degree int + Coefficients []float64 +} + +func (pr *PolynomialRegression) Fit(xs, ys []float64) { + samples := len(xs) + feats := pr.Degree + 1 + + feat := mat.NewDense(samples, feats, nil) + { + for i := 0; i < samples; i++ { + for j := 0; j < feats; j++ { + feat.Set(i, j, math.Pow(xs[i], float64(j))) + } + } + var qr mat.QR + qr.Factorize(feat) + } + yVec := mat.NewVecDense(samples, ys) + + var coef mat.VecDense + if err := coef.SolveVec(feat, yVec); err != 
nil { + panic("failed to solve") + } + + pr.Coefficients = coef.RawVector().Data +} + +func (pr *PolynomialRegression) Predict(x float64) (y float64) { + y = 0 + for i := 0; i < pr.Degree+1; i++ { + y += pr.Coefficients[i] * math.Pow(x, float64(i)) + } + return +} + +func DiffusionModelMemoryUsageRegression(output string) { + type Regression struct { + Name string + LinearRegression *LinearRegression + PolynomialRegression *PolynomialRegression + } + + const tmplStr = ` +package gguf_parser + +import "math" + +{{ range . -}} +// {{ .Name }} returns the memory usage in bytes for the given width and height, +// which is calculated by linear regression or polynomial regression. +func {{ .Name }}(width, height uint32, flashAttention bool) uint64 { + coefficients := []float64{ {{ range .PolynomialRegression.Coefficients }}{{ . }}, {{ end }} } + degree := {{ .PolynomialRegression.Degree }} + x := float64(width * height) + + {{ if .LinearRegression -}} + if flashAttention { + coefficients = []float64{ {{ .LinearRegression.Intercept }}, {{ .LinearRegression.Slope }} } + degree = 1 + } + {{- end }} + + y := float64(0) + for i := 0; i <= degree; i++ { + y += coefficients[i] * math.Pow(x, float64(i)) + } + return uint64(y) +} + +{{ end }} + +` + ts := []struct { + n string + x2y map[float64]float64 + c map[float64]float64 + fax2y map[float64]float64 + fac map[float64]float64 + }{ + { + n: "GuessSD1DiffusionModelMemoryUsage", + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 49.57 MB(VRAM) // 256*256 + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 559.90 MB(VRAM) // 512*512 + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 8360.93 MB(VRAM) // 1024*1024 + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 18681.62 MB(VRAM) // 1024*1536 + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 25377.96 MB(VRAM) // 1024*1792 + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 41842.65 MB(VRAM) // 1536*1536 + // [DEBUG] 
ggml_extend.hpp:1031 - unet compute buffer size: 77333.77 MB(VRAM) // 1792*1792 + x2y: map[float64]float64{ + 256 * 256: 49.57, + 512 * 512: 559.90, + 1024 * 1024: 8360.93, + 1024 * 1536: 18681.62, + 1024 * 1792: 25377.96, + 1536 * 1536: 41842.65, + 1792 * 1792: 77333.77, + }, + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 56879.17 MB(VRAM) // 1536*1792 + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 100924.37 MB(VRAM) // 1792*2048 + c: map[float64]float64{ + 1536 * 1792: 56879.17, + 1792 * 2048: 100924.37, + }, + }, + { + n: "GuessSD2DiffusionModelMemoryUsage", + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 37.65 MB(VRAM) // 256*256 + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 367.98 MB(VRAM) // 512*512 + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 830.86 MB(VRAM) // 1024*1024 + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 11769.69 MB(VRAM) // 1024*1536 + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 15970.04 MB(VRAM) // 1024*1792 + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 26290.73 MB(VRAM) // 1536*1536 + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 48521.84 MB(VRAM) // 1792*1792 + x2y: map[float64]float64{ + 256 * 256: 37.65, + 512 * 512: 367.98, + 1024 * 1024: 830.86, + 1024 * 1536: 11769.69, + 1024 * 1792: 15970.04, + 1536 * 1536: 26290.73, + 1792 * 1792: 48521.84, + }, + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 35711.24 MB(VRAM) // 1536*1792 + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 63292.44 MB(VRAM) // 1792*2048 + c: map[float64]float64{ + 1536 * 1792: 35711.24, + 1792 * 2048: 63292.44, + }, + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 34.52 MB(VRAM) // 256*256 + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 130.48 MB(VRAM) // 512*512 + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 519.01 MB(VRAM) // 1024*1024 + // [DEBUG] 
ggml_extend.hpp:1031 - unet compute buffer size: 774.69 MB(VRAM) // 1024*1536 + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 902.54 MB(VRAM) // 1024*1792 + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1158.23 MB(VRAM) // 1536*1536 + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1573.72 MB(VRAM) // 1792*1792 + fax2y: map[float64]float64{ + 256 * 256: 34.52, + 512 * 512: 130.48, + 1024 * 1024: 519.01, + 1024 * 1536: 774.69, + 1024 * 1792: 902.54, + 1536 * 1536: 1158.23, + 1792 * 1792: 1573.72, + }, + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1349.99 MB(VRAM) // 1536*1792 + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1797.44 MB(VRAM) // 1792*2048 + fac: map[float64]float64{ + 1536 * 1792: 1349.99, + 1792 * 2048: 1797.44, + }, + }, + { + n: "GuessSDXLDiffusionModelMemoryUsage", + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 60.76 MB(VRAM) // 256*256 + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 132.05 MB(VRAM) // 512*512 + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 830.86 MB(VRAM) // 1024*1024 + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1701.55 MB(VRAM) // 1024*1536 + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 2256.90 MB(VRAM) // 1024*1792 + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 3607.58 MB(VRAM) // 1536*1536 + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 6484.95 MB(VRAM) // 1792*1792 + x2y: map[float64]float64{ + 256 * 256: 60.76, + 512 * 512: 132.05, + 1024 * 1024: 830.86, + 1024 * 1536: 1701.55, + 1024 * 1792: 2256.90, + 1536 * 1536: 3607.58, + 1792 * 1792: 6484.95, + }, + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 4830.60 MB(VRAM) // 1536*1792 + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 8384.30 MB(VRAM) // 1792*2048 + c: map[float64]float64{ + 1536 * 1792: 4830.60, + 1792 * 2048: 8384.30, + }, + // [DEBUG] ggml_extend.hpp:1031 - unet 
compute buffer size: 60.13 MB(VRAM) // 256*256 + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 132.05 MB(VRAM) // 512*512 + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 440.86 MB(VRAM) // 1024*1024 + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 726.55 MB(VRAM) // 1024*1536 + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 874.40 MB(VRAM) // 1024*1792 + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1110.08 MB(VRAM) // 1536*1536 + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1554.33 MB(VRAM) // 1792*1792 + fax2y: map[float64]float64{ + 256 * 256: 60.13, + 512 * 512: 132.05, + 1024 * 1024: 440.86, + 1024 * 1536: 726.55, + 1024 * 1792: 874.40, + 1536 * 1536: 1110.08, + 1792 * 1792: 1554.33, + }, + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1339.35 MB(VRAM) // 1536*1792 + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1769.30 MB(VRAM) // 1792*2048 + fac: map[float64]float64{ + 1536 * 1792: 1339.35, + 1792 * 2048: 1769.30, + }, + }, + { + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 44.57 MB(VRAM) // 256*256 + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 154.40 MB(VRAM) // 512*512 + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 968.43 MB(VRAM) // 1024*1024 + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 2013.12 MB(VRAM) // 1024*1536 + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 2679.46 MB(VRAM) // 1024*1792 + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 4300.15 MB(VRAM) // 1536*1536 + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 7752.77 MB(VRAM) // 1792*1792 + n: "GuessSDXLRefinerDiffusionModelMemoryUsage", + x2y: map[float64]float64{ + 256 * 256: 44.57, + 512 * 512: 154.40, + 1024 * 1024: 968.43, + 1024 * 1536: 2013.12, + 1024 * 1792: 2679.46, + 1536 * 1536: 4300.15, + 1792 * 1792: 7752.77, + }, + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer 
size: 5767.67 MB(VRAM) // 1536*1792 + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 10031.87 MB(VRAM) // 1792*2048 + c: map[float64]float64{ + 1536 * 1792: 5767.67, + 1792 * 2048: 10031.87, + }, + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 44.57 MB(VRAM) // 256*256 + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 154.40 MB(VRAM) // 512*512 + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 596.43 MB(VRAM) // 1024*1024 + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 915.12 MB(VRAM) // 1024*1536 + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1062.46 MB(VRAM) // 1024*1792 + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1357.15 MB(VRAM) // 1536*1536 + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1836.02 MB(VRAM) // 1792*1792 + fax2y: map[float64]float64{ + 256 * 256: 44.57, + 512 * 512: 154.40, + 1024 * 1024: 596.43, + 1024 * 1536: 915.12, + 1024 * 1792: 1062.46, + 1536 * 1536: 1357.15, + 1792 * 1792: 1836.02, + }, + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 1578.17 MB(VRAM) // 1536*1792 + // [DEBUG] ggml_extend.hpp:1031 - unet compute buffer size: 2014.02 MB(VRAM) // 1792*2048 + fac: map[float64]float64{ + 1536 * 1792: 1578.17, + 1792 * 2048: 2014.02, + }, + }, + { + n: "GuessSD3MediumDiffusionModelMemoryUsage", + // [DEBUG] ggml_extend.hpp:1034 - mmdit compute buffer size: 37.09 MB(VRAM) // 256*256 + // [DEBUG] ggml_extend.hpp:1034 - mmdit compute buffer size: 169.64 MB(VRAM) // 512*512 + // [DEBUG] ggml_extend.hpp:1034 - mmdit compute buffer size: 1786.11 MB(VRAM) // 1024*1024 + // [DEBUG] ggml_extend.hpp:1034 - mmdit compute buffer size: 3824.36 MB(VRAM) // 1024*1536 + // [DEBUG] ggml_extend.hpp:1034 - mmdit compute buffer size: 5131.48 MB(VRAM) // 1024*1792 + // [DEBUG] ggml_extend.hpp:1034 - mmdit compute buffer size: 8319.03 MB(VRAM) // 1536*1536 + // [DEBUG] ggml_extend.hpp:1034 - mmdit compute buffer size: 15141.18 MB(VRAM) // 
1792*1792 + x2y: map[float64]float64{ + 256 * 256: 37.09, + 512 * 512: 169.64, + 1024 * 1024: 1786.11, + 1024 * 1536: 3824.36, + 1024 * 1792: 5131.48, + 1536 * 1536: 8319.03, + 1792 * 1792: 15141.18, + }, + // [DEBUG] ggml_extend.hpp:1034 - mmdit compute buffer size: 11215.71 MB(VRAM) // 1536*1792 + // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 19654.65 MB(VRAM) // 1792*2048 + c: map[float64]float64{ + 1536 * 1792: 11215.71, + 1792 * 2048: 19654.65, + }, + }, + { + n: "GuessSD35MediumDiffusionModelMemoryUsage", + // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 41.48 MB(VRAM) // 256*256 + // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 181.64 MB(VRAM) // 512*512 + // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 1834.11 MB(VRAM) // 1024*1024 + // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 3896.36 MB(VRAM) // 1024*1536 + // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 5215.48 MB(VRAM) // 1024*1792 + // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 8427.03 MB(VRAM) // 1536*1536 + // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 15288.18 MiB(VRAM) // 1792*1792 + x2y: map[float64]float64{ + 256 * 256: 41.48, + 512 * 512: 181.64, + 1024 * 1024: 1834.11, + 1024 * 1536: 3896.36, + 1024 * 1792: 5215.48, + 1536 * 1536: 8427.03, + 1792 * 1792: 15288.18, + }, + // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 11341.71 MB(VRAM) // 1536*1792 + // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 19822.65 MB(VRAM) // 1792*2048 + c: map[float64]float64{ + 1536 * 1792: 11341.71, + 1792 * 2048: 19822.65, + }, + }, + { + n: "GuessSD35LargeDiffusionModelMemoryUsage", + // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 57.27 MB(VRAM) // 256*256 + // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 276.54 MB(VRAM) // 512*512 + // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 2865.44 MB(VRAM) // 1024*1024 + // [DEBUG] 
ggml_extend.hpp:1031 - mmdit compute buffer size: 6109.95 MB(VRAM) // 1024*1536 + // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 8188.92 MB(VRAM) // 1024*1792 + // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 13258.86 MB(VRAM) // 1536*1536 + // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 24091.01 MiB(VRAM) // 1792*1792 + x2y: map[float64]float64{ + 256 * 256: 57.27, + 512 * 512: 276.54, + 1024 * 1024: 2865.44, + 1024 * 1536: 6109.95, + 1024 * 1792: 8188.92, + 1536 * 1536: 13258.86, + 1792 * 1792: 24091.01, + }, + // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 17859.31 MB(VRAM) // 1536*1792 + // [DEBUG] ggml_extend.hpp:1031 - mmdit compute buffer size: 31253.70 MB(VRAM) // 1792*2048 + c: map[float64]float64{ + 1536 * 1792: 17859.31, + 1792 * 2048: 31253.70, + }, + }, + { + n: "GuessFLUXDiffusionModelMemoryUsage", + // [DEBUG] ggml_extend.hpp:1031 - flux compute buffer size: 103.35 MB(VRAM) // 256*256 + // [DEBUG] ggml_extend.hpp:1031 - flux compute buffer size: 398.05 MB(VRAM) // 512*512 + // [DEBUG] ggml_extend.hpp:1031 - flux compute buffer size: 2576.18 MB(VRAM) // 1024*1024 + // [DEBUG] ggml_extend.hpp:1031 - flux compute buffer size: 4978.31 MB(VRAM) // 1024*1536 + // [DEBUG] ggml_extend.hpp:1031 - flux compute buffer size: 6467.37 MB(VRAM) // 1024*1792 + // [DEBUG] ggml_extend.hpp:1031 - flux compute buffer size: 10021.49 MB(VRAM) // 1536*1536 + // [DEBUG] ggml_extend.hpp:1031 - flux compute buffer size: 17434.95 MB(VRAM) // 1792*1792 + x2y: map[float64]float64{ + 256 * 256: 103.35, + 512 * 512: 398.05, + 1024 * 1024: 2576.18, + 1024 * 1536: 4978.31, + 1024 * 1792: 6467.37, + 1536 * 1536: 10021.49, + 1792 * 1792: 17434.95, + }, + // [DEBUG] ggml_extend.hpp:1031 - flux compute buffer size: 13191.09 MB(VRAM) // 1536*1792 + // [DEBUG] ggml_extend.hpp:1031 - flux compute buffer size: 22266.81 MB(VRAM) // 1792*2048 + c: map[float64]float64{ + 1536 * 1792: 13191.09, + 1792 * 2048: 22266.81, + }, + }, + } + + rs 
:= make([]Regression, len(ts)) + for i, t := range ts { + rs[i].Name = t.n + } + + fmt.Println("Polynomial Regression For None Flash Attention") + for i, t := range ts { + pr := PolynomialRegression{ + Degree: 2, + } + + xs, ys := make([]float64, 0, len(t.x2y)), make([]float64, 0, len(t.x2y)) + for x, y := range t.x2y { + xs = append(xs, x) + ys = append(ys, y*1024*1024) // MB to B + } + pr.Fit(xs, ys) + + for x, y := range t.c { + y_ := pr.Predict(x) / 1024 / 1024 // B to MB + d := (y_ - y) / y * 100 + s := "+" + if d < 0 { + s = "" + } + c := "" + if d > 10 { + c = "?" + } + + fmt.Printf("%50s: y': %10.2f | y: %10.2f | d: %10s%% %s\n", t.n, y_, y, s+strconv.FormatFloat(d, 'f', 6, 64), c) + } + + rs[i].PolynomialRegression = &pr + } + + fmt.Println("Linear Regression For Flash Attention") + for i, t := range ts { + if len(t.fax2y) == 0 { + continue + } + + lr := LinearRegression{} + + xs, ys := make([]float64, 0, len(t.fax2y)), make([]float64, 0, len(t.fax2y)) + for x, y := range t.fax2y { + xs = append(xs, x) + ys = append(ys, y*1024*1024) // MB to B + } + lr.Fit(xs, ys) + + for x, y := range t.fac { + y_ := lr.Predict(x) / 1024 / 1024 // B to MB + d := (y_ - y) / y * 100 + s := "+" + if d < 0 { + s = "" + } + c := "" + if d > 10 { + c = "?" 
+ } + + fmt.Printf("%50s: y': %10.2f | y: %10.2f | d: %10s%% %s\n", t.n, y_, y, s+strconv.FormatFloat(d, 'f', 6, 64), c) + } + + rs[i].LinearRegression = &lr + } + + var code []byte + { + var ( + buff bytes.Buffer + err error + ) + tmpl := template.Must(template.New("tmpl").Parse(tmplStr)) + if err = tmpl.Execute(&buff, rs); err != nil { + panic(fmt.Errorf("failed to execute template: %w", err)) + } + code, err = format.Source(buff.Bytes()) + if err != nil { + panic(fmt.Errorf("failed to format source: %w", err)) + } + } + + if err := os.WriteFile(output, code, 0644); err != nil { + panic(fmt.Errorf("failed to write file: %w", err)) + } +} + +func main() { + DiffusionModelMemoryUsageRegression("zz_generated.diffusion_model_memory_usage.regression.go") +} diff --git a/gen.stringer.go b/gen.stringer.go index faa8692..0926dc8 100644 --- a/gen.stringer.go +++ b/gen.stringer.go @@ -1,4 +1,4 @@ -//go:build tools +//go:build stringer //go:generate go run golang.org/x/tools/cmd/stringer -linecomment -type GGUFMagic -output zz_generated.ggufmagic.stringer.go -trimprefix GGUFMagic //go:generate go run golang.org/x/tools/cmd/stringer -linecomment -type GGUFVersion -output zz_generated.ggufversion.stringer.go -trimprefix GGUFVersion diff --git a/go.mod b/go.mod index 30fd5a8..c16e57b 100644 --- a/go.mod +++ b/go.mod @@ -16,6 +16,7 @@ require ( golang.org/x/sync v0.9.0 golang.org/x/sys v0.27.0 golang.org/x/tools v0.27.0 + gonum.org/v1/gonum v0.15.1 ) require ( diff --git a/go.sum b/go.sum index 9645657..99b23c9 100644 --- a/go.sum +++ b/go.sum @@ -36,6 +36,8 @@ golang.org/x/term v0.26.0 h1:WEQa6V3Gja/BhNxg540hBip/kkaYtRg3cxg4oXSw4AU= golang.org/x/term v0.26.0/go.mod h1:Si5m1o57C5nBNQo5z1iq+XDijt21BDBDp2bK0QI8e3E= golang.org/x/tools v0.27.0 h1:qEKojBykQkQ4EynWy4S8Weg69NumxKdn40Fce3uc/8o= golang.org/x/tools v0.27.0/go.mod h1:sUi0ZgbwW9ZPAq26Ekut+weQPR5eIM6GQLQ1Yjm1H0Q= +gonum.org/v1/gonum v0.15.1 h1:FNy7N6OUZVUaWG9pTiD+jlhdQ3lMP+/LcTpJ6+a8sQ0= +gonum.org/v1/gonum v0.15.1/go.mod 
h1:eZTZuRFrzu5pcyjN5wJhcIhnUdNijYxX1T2IcrOGY0o= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= diff --git a/zz_generated.diffusion_model_memory_usage.regression.go b/zz_generated.diffusion_model_memory_usage.regression.go new file mode 100644 index 0000000..5f870f1 --- /dev/null +++ b/zz_generated.diffusion_model_memory_usage.regression.go @@ -0,0 +1,130 @@ +package gguf_parser + +import "math" + +// GuessSD1DiffusionModelMemoryUsage returns the memory usage in bytes for the given width and height, +// which is calculated by linear regression or polynomial regression. +func GuessSD1DiffusionModelMemoryUsage(width, height uint32, flashAttention bool) uint64 { + coefficients := []float64{7.8763685671743e+06, 161.42301986333496, 0.007812489338703485} + degree := 2 + x := float64(width * height) + + y := float64(0) + for i := 0; i <= degree; i++ { + y += coefficients[i] * math.Pow(x, float64(i)) + } + return uint64(y) +} + +// GuessSD2DiffusionModelMemoryUsage returns the memory usage in bytes for the given width and height, +// which is calculated by linear regression or polynomial regression. +func GuessSD2DiffusionModelMemoryUsage(width, height uint32, flashAttention bool) uint64 { + coefficients := []float64{-3.5504397905618614e+08, -1193.3271458642232, 0.005402381760522009} + degree := 2 + x := float64(width * height) + + if flashAttention { + coefficients = []float64{3.78068128077788e+06, 513.2102510934714} + degree = 1 + } + + y := float64(0) + for i := 0; i <= degree; i++ { + y += coefficients[i] * math.Pow(x, float64(i)) + } + return uint64(y) +} + +// GuessSDXLDiffusionModelMemoryUsage returns the memory usage in bytes for the given width and height, +// which is calculated by linear regression or polynomial regression. 
+func GuessSDXLDiffusionModelMemoryUsage(width, height uint32, flashAttention bool) uint64 { + coefficients := []float64{5.554129038929968e+07, 138.31961166554433, 0.0006109454572342757} + degree := 2 + x := float64(width * height) + + if flashAttention { + coefficients = []float64{-5.95880278052181e+06, 500.0687898914631} + degree = 1 + } + + y := float64(0) + for i := 0; i <= degree; i++ { + y += coefficients[i] * math.Pow(x, float64(i)) + } + return uint64(y) +} + +// GuessSDXLRefinerDiffusionModelMemoryUsage returns the memory usage in bytes for the given width and height, +// which is calculated by linear regression or polynomial regression. +func GuessSDXLRefinerDiffusionModelMemoryUsage(width, height uint32, flashAttention bool) uint64 { + coefficients := []float64{4.939599234485548e+07, 155.2477810191175, 0.0007351735797614931} + degree := 2 + x := float64(width * height) + + if flashAttention { + coefficients = []float64{7.0313433199802125e+06, 599.4137437226634} + degree = 1 + } + + y := float64(0) + for i := 0; i <= degree; i++ { + y += coefficients[i] * math.Pow(x, float64(i)) + } + return uint64(y) +} + +// GuessSD3MediumDiffusionModelMemoryUsage returns the memory usage in bytes for the given width and height, +// which is calculated by linear regression or polynomial regression. +func GuessSD3MediumDiffusionModelMemoryUsage(width, height uint32, flashAttention bool) uint64 { + coefficients := []float64{1.6529921370035086e+07, 234.66562477184195, 0.0014648995324747492} + degree := 2 + x := float64(width * height) + + y := float64(0) + for i := 0; i <= degree; i++ { + y += coefficients[i] * math.Pow(x, float64(i)) + } + return uint64(y) +} + +// GuessSD35MediumDiffusionModelMemoryUsage returns the memory usage in bytes for the given width and height, +// which is calculated by linear regression or polynomial regression. 
+func GuessSD35MediumDiffusionModelMemoryUsage(width, height uint32, flashAttention bool) uint64 { + coefficients := []float64{1.7441103472644456e+07, 281.695681980568, 0.0014651233076620938} + degree := 2 + x := float64(width * height) + + y := float64(0) + for i := 0; i <= degree; i++ { + y += coefficients[i] * math.Pow(x, float64(i)) + } + return uint64(y) +} + +// GuessSD35LargeDiffusionModelMemoryUsage returns the memory usage in bytes for the given width and height, +// which is calculated by linear regression or polynomial regression. +func GuessSD35LargeDiffusionModelMemoryUsage(width, height uint32, flashAttention bool) uint64 { + coefficients := []float64{2.320436920291992e+07, 410.3731196298318, 0.002319594715894278} + degree := 2 + x := float64(width * height) + + y := float64(0) + for i := 0; i <= degree; i++ { + y += coefficients[i] * math.Pow(x, float64(i)) + } + return uint64(y) +} + +// GuessFLUXDiffusionModelMemoryUsage returns the memory usage in bytes for the given width and height, +// which is calculated by linear regression or polynomial regression. +func GuessFLUXDiffusionModelMemoryUsage(width, height uint32, flashAttention bool) uint64 { + coefficients := []float64{4.651166867423782e+07, 997.7758807792155, 0.001457339256095295} + degree := 2 + x := float64(width * height) + + y := float64(0) + for i := 0; i <= degree; i++ { + y += coefficients[i] * math.Pow(x, float64(i)) + } + return uint64(y) +}