diff --git a/examples/running-llamas/README.md b/examples/running-llamas/README.md
index a2e9678d..4abfca9f 100644
--- a/examples/running-llamas/README.md
+++ b/examples/running-llamas/README.md
@@ -1,4 +1,4 @@
-# Optimum-Benchmark x LLaMAs x BnB & GPTQ
+# Optimum-Benchmark x LLaMAs x GPTQ
 
 A set of benchmarks on Meta's LLaMA2's inference.
 
@@ -7,7 +7,6 @@ A set of benchmarks on Meta's LLaMA2's inference.
 You will need to install these quantization packages:
 
 ```bash
-pip install bitsandbytes
 pip install auto-gptq # or install it from source
 ```
 
@@ -17,11 +16,10 @@ Then run these commands from this directory:
 
 ```bash
 optimum-benchmark --config-dir configs/ --config-name _base_ --multirun
-optimum-benchmark --config-dir configs/ --config-name bnb --multirun
 optimum-benchmark --config-dir configs/ --config-name gptq --multirun
 ```
 
-This will create a folder called `experiments` with the results of the benchmarks with an inference `batch_size` ranging from 1 to 16 and an input `sequence_length` (prompt size) of 512.
+This will create a folder called `experiments` containing the benchmark results, with an inference `batch_size` ranging from 1 to 16 and an input `sequence_length` (prompt size) of 256.
 
 ## Reporting
 
diff --git a/examples/running-llamas/artifacts/A100-80GB/forward_latency_plot.png b/examples/running-llamas/artifacts/A100-80GB/forward_latency_plot.png
index 12ccb42b..4e7dd369 100644
Binary files a/examples/running-llamas/artifacts/A100-80GB/forward_latency_plot.png and b/examples/running-llamas/artifacts/A100-80GB/forward_latency_plot.png differ
diff --git a/examples/running-llamas/artifacts/A100-80GB/forward_memory_plot.png b/examples/running-llamas/artifacts/A100-80GB/forward_memory_plot.png
index c2cb4bd1..5bb39d63 100644
Binary files a/examples/running-llamas/artifacts/A100-80GB/forward_memory_plot.png and b/examples/running-llamas/artifacts/A100-80GB/forward_memory_plot.png differ
diff --git a/examples/running-llamas/artifacts/A100-80GB/full_report.csv b/examples/running-llamas/artifacts/A100-80GB/full_report.csv
index f179eee7..f05eb1e4 100644
--- a/examples/running-llamas/artifacts/A100-80GB/full_report.csv
+++ b/examples/running-llamas/artifacts/A100-80GB/full_report.csv
@@ -1,11 +1,11 @@
experiment_name,backend.name,backend.version,backend._target_,backend.seed,backend.inter_op_num_threads,backend.intra_op_num_threads,backend.initial_isolation_check,backend.continous_isolation_check,backend.delete_cache,backend.no_weights,backend.device_map,backend.torch_dtype,backend.disable_grad,backend.eval_mode,backend.amp_autocast,backend.amp_dtype,backend.torch_compile,backend.bettertransformer,backend.quantization_scheme,backend.use_ddp,backend.peft_strategy,benchmark.name,benchmark._target_,benchmark.duration,benchmark.warmup_runs,benchmark.memory,benchmark.energy,benchmark.input_shapes.batch_size,benchmark.input_shapes.sequence_length,benchmark.input_shapes.num_choices,benchmark.input_shapes.feature_size,benchmark.input_shapes.nb_max_frames,benchmark.input_shapes.audio_sequence_length,benchmark.new_tokens,benchmark.can_diffuse,benchmark.can_generate,benchmark.generate_kwargs.max_new_tokens,benchmark.generate_kwargs.min_new_tokens,benchmark.generate_kwargs.do_sample,benchmark.generate_kwargs.use_cache,benchmark.generate_kwargs.pad_token_id,benchmark.generate_kwargs.num_beams,model,device,task,hub_kwargs.revision,hub_kwargs.cache_dir,hub_kwargs.force_download,hub_kwargs.local_files_only,environment.optimum_version,environment.optimum_commit,environment.transformers_version,environment.transformers_commit,environment.accelerate_version,environment.accelerate_commit,environment.diffusers_version,environment.diffusers_commit,environment.python_version,environment.system,environment.cpu,environment.cpu_count,environment.cpu_ram_mb,environment.gpus,forward.latency(s),forward.throughput(samples/s),forward.max_memory_used(MB),forward.max_memory_allocated(MB),forward.max_memory_reserved(MB),generate.latency(s),generate.throughput(tokens/s),generate.max_memory_used(MB),generate.max_memory_allocated(MB),generate.max_memory_reserved(MB) -fp16-batch_size(16)-sequence_length(256)-new_tokens(512),pytorch,2.1.0+cu118,optimum_benchmark.backends.pytorch.backend.PyTorchBackend,42,,,False,False,False,False,,float16,True,True,False,,False,False,,False,,inference,optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark,10,10,True,False,16,256,1,80,3000,16000,512,False,True,512,512,False,True,0,1,meta-llama/Llama-2-7b-hf,cuda,text-generation,main,,False,False,1.13.2,,4.34.1,,0.24.1,,,,3.10.12,Linux, AMD EPYC 7742 64-Core Processor,128,540684,['NVIDIA A100-SXM4-80GB'],0.402,39.8,26889,19191,24591,17.3,474.0,79295,26441,76996 -fp16-batch_size(8)-sequence_length(256)-new_tokens(512),pytorch,2.1.0+cu118,optimum_benchmark.backends.pytorch.backend.PyTorchBackend,42,,,False,False,False,False,,float16,True,True,False,,False,False,,False,,inference,optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark,10,10,True,False,8,256,1,80,3000,16000,512,False,True,512,512,False,True,0,1,meta-llama/Llama-2-7b-hf,cuda,text-generation,main,,False,False,1.13.2,,4.34.1,,0.24.1,,,,3.10.12,Linux, AMD EPYC 7742 64-Core Processor,128,540684,['NVIDIA A100-SXM4-80GB'],0.202,39.6,26369,16372,24071,14.0,293.0,32302,19990,30003 -gptq-batch_size(16)-sequence_length(256)-new_tokens(512),pytorch,2.1.0+cu118,optimum_benchmark.backends.pytorch.backend.PyTorchBackend,42,,,False,False,False,False,,float16,True,True,False,,False,False,,False,,inference,optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark,10,10,True,False,16,256,1,80,3000,16000,512,False,True,512,512,False,True,0,1,TheBloke/Llama-2-7B-GPTQ,cuda,text-generation,main,,False,False,1.13.2,,4.34.1,,0.24.1,,,,3.10.12,Linux, AMD EPYC 7742 64-Core 
Processor,128,540684,['NVIDIA A100-SXM4-80GB'],0.415,38.6,13652,9752,11353,24.6,333.0,83302,17001,81004 -fp16-batch_size(4)-sequence_length(256)-new_tokens(512),pytorch,2.1.0+cu118,optimum_benchmark.backends.pytorch.backend.PyTorchBackend,42,,,False,False,False,False,,float16,True,True,False,,False,False,,False,,inference,optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark,10,10,True,False,4,256,1,80,3000,16000,512,False,True,512,512,False,True,0,1,meta-llama/Llama-2-7b-hf,cuda,text-generation,main,,False,False,1.13.2,,4.34.1,,0.24.1,,,,3.10.12,Linux, AMD EPYC 7742 64-Core Processor,128,540684,['NVIDIA A100-SXM4-80GB'],0.106,37.7,25843,14962,23544,13.7,149.0,25843,16768,23544 -gptq-batch_size(8)-sequence_length(256)-new_tokens(512),pytorch,2.1.0+cu118,optimum_benchmark.backends.pytorch.backend.PyTorchBackend,42,,,False,False,False,False,,float16,True,True,False,,False,False,,False,,inference,optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark,10,10,True,False,8,256,1,80,3000,16000,512,False,True,512,512,False,True,0,1,TheBloke/Llama-2-7B-GPTQ,cuda,text-generation,main,,False,False,1.13.2,,4.34.1,,0.24.1,,,,3.10.12,Linux, AMD EPYC 7742 64-Core Processor,128,540684,['NVIDIA A100-SXM4-80GB'],0.22,36.4,10252,6933,7954,19.8,207.0,44107,10556,41808 -fp16-batch_size(2)-sequence_length(256)-new_tokens(512),pytorch,2.1.0+cu118,optimum_benchmark.backends.pytorch.backend.PyTorchBackend,42,,,False,False,False,False,,float16,True,True,False,,False,False,,False,,inference,optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark,10,10,True,False,2,256,1,80,3000,16000,512,False,True,512,512,False,True,0,1,meta-llama/Llama-2-7b-hf,cuda,text-generation,main,,False,False,1.13.2,,4.34.1,,0.24.1,,,,3.10.12,Linux, AMD EPYC 7742 64-Core Processor,128,540684,['NVIDIA A100-SXM4-80GB'],0.0574,34.8,25840,14258,23542,13.7,74.7,25840,15160,23542 -gptq-batch_size(4)-sequence_length(256)-new_tokens(512),pytorch,2.1.0+cu118,optimum_benchmark.backends.pytorch.backend.PyTorchBackend,42,,,False,False,False,False,,float16,True,True,False,,False,False,,False,,inference,optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark,10,10,True,False,4,256,1,80,3000,16000,512,False,True,512,512,False,True,0,1,TheBloke/Llama-2-7B-GPTQ,cuda,text-generation,main,,False,False,1.13.2,,4.34.1,,0.24.1,,,,3.10.12,Linux, AMD EPYC 7742 64-Core Processor,128,540684,['NVIDIA A100-SXM4-80GB'],0.121,33.1,8541,5524,6243,15.2,135.0,16399,7334,14101 -fp16-batch_size(1)-sequence_length(256)-new_tokens(512),pytorch,2.1.0+cu118,optimum_benchmark.backends.pytorch.backend.PyTorchBackend,42,,,False,False,False,False,,float16,True,True,False,,False,False,,False,,inference,optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark,10,10,True,False,1,256,1,80,3000,16000,512,False,True,512,512,False,True,0,1,meta-llama/Llama-2-7b-hf,cuda,text-generation,main,,False,False,1.13.2,,4.34.1,,0.24.1,,,,3.10.12,Linux, AMD EPYC 7742 64-Core Processor,128,540684,['NVIDIA A100-SXM4-80GB'],0.0325,30.8,25840,13906,23542,13.2,38.8,25840,14356,23542 -gptq-batch_size(2)-sequence_length(256)-new_tokens(512),pytorch,2.1.0+cu118,optimum_benchmark.backends.pytorch.backend.PyTorchBackend,42,,,False,False,False,False,,float16,True,True,False,,False,False,,False,,inference,optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark,10,10,True,False,2,256,1,80,3000,16000,512,False,True,512,512,False,True,0,1,TheBloke/Llama-2-7B-GPTQ,cuda,text-generation,main,,False,False,1.13.2,,4.34.1,,0.24.1,,,,3.10.12,Linux, 
AMD EPYC 7742 64-Core Processor,128,540684,['NVIDIA A100-SXM4-80GB'],0.0703,28.4,7167,4818,4869,15.0,68.3,8728,5722,6429 -gptq-batch_size(1)-sequence_length(256)-new_tokens(512),pytorch,2.1.0+cu118,optimum_benchmark.backends.pytorch.backend.PyTorchBackend,42,,,False,False,False,False,,float16,True,True,False,,False,False,,False,,inference,optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark,10,10,True,False,1,256,1,80,3000,16000,512,False,True,512,512,False,True,0,1,TheBloke/Llama-2-7B-GPTQ,cuda,text-generation,main,,False,False,1.13.2,,4.34.1,,0.24.1,,,,3.10.12,Linux, AMD EPYC 7742 64-Core Processor,128,540684,['NVIDIA A100-SXM4-80GB'],0.0457,21.9,6803,4467,4504,14.6,35.1,7599,4916,5301 +fp16-batch_size(16)-sequence_length(256)-new_tokens(512),pytorch,2.1.0+cu118,optimum_benchmark.backends.pytorch.backend.PyTorchBackend,42,,,False,False,False,False,,float16,True,True,False,,False,False,,False,,inference,optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark,10,10,True,False,16,256,1,80,3000,16000,512,False,True,512,512,False,True,0,1,meta-llama/Llama-2-7b-hf,cuda:0,text-generation,main,,False,False,1.13.2,,4.34.1,,0.24.1,,,,3.10.12,Linux, AMD EPYC 7742 64-Core Processor,128,540684,"['NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB']",0.402,39.8,19165,16520,17779,17.4,471.0,27988,26442,84511 +fp16-batch_size(8)-sequence_length(256)-new_tokens(512),pytorch,2.1.0+cu118,optimum_benchmark.backends.pytorch.backend.PyTorchBackend,42,,,False,False,False,False,,float16,True,True,False,,False,False,,False,,inference,optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark,10,10,True,False,8,256,1,80,3000,16000,512,False,True,512,512,False,True,0,1,meta-llama/Llama-2-7b-hf,cuda,text-generation,main,,False,False,1.13.2,,4.34.1,,0.24.1,,,,3.10.12,Linux, AMD EPYC 7742 64-Core Processor,128,540684,"['NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB']",0.204,39.2,17087,15037,15701,14.1,290.0,64889,19997,63503 +gptq-batch_size(16)-sequence_length(256)-new_tokens(512),pytorch,2.1.0+cu118,optimum_benchmark.backends.pytorch.backend.PyTorchBackend,42,,,False,False,False,False,,float16,True,True,False,,False,False,,False,,inference,optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark,10,10,True,False,16,256,1,80,3000,16000,512,False,True,512,512,False,True,0,1,TheBloke/Llama-2-7B-GPTQ,cuda:0,text-generation,main,,False,False,1.13.2,,4.34.1,,0.24.1,,,,3.10.12,Linux, AMD EPYC 7742 64-Core Processor,128,540684,"['NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB']",0.415,38.6,10900,7080,8604,24.6,333.0,65676,17002,83596 +fp16-batch_size(4)-sequence_length(256)-new_tokens(512),pytorch,2.1.0+cu118,optimum_benchmark.backends.pytorch.backend.PyTorchBackend,42,,,False,False,False,False,,float16,True,True,False,,False,False,,False,,inference,optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark,10,10,True,False,4,256,1,80,3000,16000,512,False,True,512,512,False,True,0,1,meta-llama/Llama-2-7b-hf,cuda,text-generation,main,,False,False,1.13.2,,4.34.1,,0.24.1,,,,3.10.12,Linux, AMD EPYC 7742 64-Core Processor,128,540684,"['NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB']",0.107,37.4,16022,14295,14636,13.9,147.0,26346,16774,24960 
+gptq-batch_size(8)-sequence_length(256)-new_tokens(512),pytorch,2.1.0+cu118,optimum_benchmark.backends.pytorch.backend.PyTorchBackend,42,,,False,False,False,False,,float16,True,True,False,,False,False,,False,,inference,optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark,10,10,True,False,8,256,1,80,3000,16000,512,False,True,512,512,False,True,0,1,TheBloke/Llama-2-7B-GPTQ,cuda,text-generation,main,,False,False,1.13.2,,4.34.1,,0.24.1,,,,3.10.12,Linux, AMD EPYC 7742 64-Core Processor,128,540684,"['NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB']",0.223,35.9,8826,5597,6530,19.9,206.0,56629,10557,54333 +fp16-batch_size(2)-sequence_length(256)-new_tokens(512),pytorch,2.1.0+cu118,optimum_benchmark.backends.pytorch.backend.PyTorchBackend,42,,,False,False,False,False,,float16,True,True,False,,False,False,,False,,inference,optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark,10,10,True,False,2,256,1,80,3000,16000,512,False,True,512,512,False,True,0,1,meta-llama/Llama-2-7b-hf,cuda,text-generation,main,,False,False,1.13.2,,4.34.1,,0.24.1,,,,3.10.12,Linux, AMD EPYC 7742 64-Core Processor,128,540684,"['NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB']",0.0579,34.5,15392,13924,14006,13.6,75.3,17003,15162,15617 +gptq-batch_size(4)-sequence_length(256)-new_tokens(512),pytorch,2.1.0+cu118,optimum_benchmark.backends.pytorch.backend.PyTorchBackend,42,,,False,False,False,False,,float16,True,True,False,,False,False,,False,,inference,optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark,10,10,True,False,4,256,1,80,3000,16000,512,False,True,512,512,False,True,0,1,TheBloke/Llama-2-7B-GPTQ,cuda,text-generation,main,,False,False,1.13.2,,4.34.1,,0.24.1,,,,3.10.12,Linux, AMD EPYC 7742 64-Core Processor,128,540684,"['NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB']",0.122,32.8,7761,4855,5465,15.3,134.0,18085,7335,15789 +fp16-batch_size(1)-sequence_length(256)-new_tokens(512),pytorch,2.1.0+cu118,optimum_benchmark.backends.pytorch.backend.PyTorchBackend,42,,,False,False,False,False,,float16,True,True,False,,False,False,,False,,inference,optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark,10,10,True,False,1,256,1,80,3000,16000,512,False,True,512,512,False,True,0,1,meta-llama/Llama-2-7b-hf,cuda,text-generation,main,,False,False,1.13.2,,4.34.1,,0.24.1,,,,3.10.12,Linux, AMD EPYC 7742 64-Core Processor,128,540684,"['NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB']",0.0328,30.5,15153,13738,13767,13.5,37.9,15866,14356,14480 +gptq-batch_size(2)-sequence_length(256)-new_tokens(512),pytorch,2.1.0+cu118,optimum_benchmark.backends.pytorch.backend.PyTorchBackend,42,,,False,False,False,False,,float16,True,True,False,,False,False,,False,,inference,optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark,10,10,True,False,2,256,1,80,3000,16000,512,False,True,512,512,False,True,0,1,TheBloke/Llama-2-7B-GPTQ,cuda,text-generation,main,,False,False,1.13.2,,4.34.1,,0.24.1,,,,3.10.12,Linux, AMD EPYC 7742 64-Core Processor,128,540684,"['NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB']",0.0706,28.3,6872,4484,4575,15.4,66.5,8822,5722,6526 
+gptq-batch_size(1)-sequence_length(256)-new_tokens(512),pytorch,2.1.0+cu118,optimum_benchmark.backends.pytorch.backend.PyTorchBackend,42,,,False,False,False,False,,float16,True,True,False,,False,False,,False,,inference,optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark,10,10,True,False,1,256,1,80,3000,16000,512,False,True,512,512,False,True,0,1,TheBloke/Llama-2-7B-GPTQ,cuda,text-generation,main,,False,False,1.13.2,,4.34.1,,0.24.1,,,,3.10.12,Linux, AMD EPYC 7742 64-Core Processor,128,540684,"['NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB']",0.0458,21.8,6746,4298,4450,14.8,34.6,7606,4916,5309
diff --git a/examples/running-llamas/artifacts/A100-80GB/generate_memory_plot.png b/examples/running-llamas/artifacts/A100-80GB/generate_memory_plot.png
index c1b08f97..750d228a 100644
Binary files a/examples/running-llamas/artifacts/A100-80GB/generate_memory_plot.png and b/examples/running-llamas/artifacts/A100-80GB/generate_memory_plot.png differ
diff --git a/examples/running-llamas/artifacts/A100-80GB/generate_throughput_plot.png b/examples/running-llamas/artifacts/A100-80GB/generate_throughput_plot.png
index 22f0daaf..dafd743e 100644
Binary files a/examples/running-llamas/artifacts/A100-80GB/generate_throughput_plot.png and b/examples/running-llamas/artifacts/A100-80GB/generate_throughput_plot.png differ
diff --git a/examples/running-llamas/artifacts/A100-80GB/rich_table.svg b/examples/running-llamas/artifacts/A100-80GB/rich_table.svg
index 29ee3e07..cc440363 100644
[rich_table.svg text diff omitted (garbled SVG markup); the regenerated "Inference Report" table adds a Group column, and its old/new values match those in full_report.csv and short_report.csv]
diff --git a/examples/running-llamas/artifacts/A100-80GB/short_report.csv b/examples/running-llamas/artifacts/A100-80GB/short_report.csv
index aa797d40..595a9dab 100644
--- a/examples/running-llamas/artifacts/A100-80GB/short_report.csv
+++ b/examples/running-llamas/artifacts/A100-80GB/short_report.csv
@@ -1,11 +1,11 @@
 experiment_name,GPU,Batch Size,Forward Latency (s),Forward Throughput (samples/s),Forward Max Memory Used (MB),Forward Max Memory Allocated (MB),Forward Max Memory Reserved (MB),Generate Throughput (tokens/s),Generate Max Memory Used (MB),Generate Max Memory Allocated (MB),Generate Max Memory Reserved (MB),Quantization Scheme
-fp16-batch_size(16)-sequence_length(256)-new_tokens(512),A100,16,0.402,39.8,26889,19191,24591,474.0,79295,26441,76996,fp16
-fp16-batch_size(8)-sequence_length(256)-new_tokens(512),A100,8,0.202,39.6,26369,16372,24071,293.0,32302,19990,30003,fp16
-gptq-batch_size(16)-sequence_length(256)-new_tokens(512),A100,16,0.415,38.6,13652,9752,11353,333.0,83302,17001,81004,GPTQ
-fp16-batch_size(4)-sequence_length(256)-new_tokens(512),A100,4,0.106,37.7,25843,14962,23544,149.0,25843,16768,23544,fp16
-gptq-batch_size(8)-sequence_length(256)-new_tokens(512),A100,8,0.22,36.4,10252,6933,7954,207.0,44107,10556,41808,GPTQ
-fp16-batch_size(2)-sequence_length(256)-new_tokens(512),A100,2,0.0574,34.8,25840,14258,23542,74.7,25840,15160,23542,fp16
-gptq-batch_size(4)-sequence_length(256)-new_tokens(512),A100,4,0.121,33.1,8541,5524,6243,135.0,16399,7334,14101,GPTQ
-fp16-batch_size(1)-sequence_length(256)-new_tokens(512),A100,1,0.0325,30.8,25840,13906,23542,38.8,25840,14356,23542,fp16
-gptq-batch_size(2)-sequence_length(256)-new_tokens(512),A100,2,0.0703,28.4,7167,4818,4869,68.3,8728,5722,6429,GPTQ
-gptq-batch_size(1)-sequence_length(256)-new_tokens(512),A100,1,0.0457,21.9,6803,4467,4504,35.1,7599,4916,5301,GPTQ
+fp16-batch_size(16)-sequence_length(256)-new_tokens(512),A100,16,0.402,39.8,19165,16520,17779,471.0,27988,26442,84511,fp16
+fp16-batch_size(8)-sequence_length(256)-new_tokens(512),A100,8,0.204,39.2,17087,15037,15701,290.0,64889,19997,63503,fp16
+gptq-batch_size(16)-sequence_length(256)-new_tokens(512),A100,16,0.415,38.6,10900,7080,8604,333.0,65676,17002,83596,GPTQ
+fp16-batch_size(4)-sequence_length(256)-new_tokens(512),A100,4,0.107,37.4,16022,14295,14636,147.0,26346,16774,24960,fp16
+gptq-batch_size(8)-sequence_length(256)-new_tokens(512),A100,8,0.223,35.9,8826,5597,6530,206.0,56629,10557,54333,GPTQ
+fp16-batch_size(2)-sequence_length(256)-new_tokens(512),A100,2,0.0579,34.5,15392,13924,14006,75.3,17003,15162,15617,fp16
+gptq-batch_size(4)-sequence_length(256)-new_tokens(512),A100,4,0.122,32.8,7761,4855,5465,134.0,18085,7335,15789,GPTQ
+fp16-batch_size(1)-sequence_length(256)-new_tokens(512),A100,1,0.0328,30.5,15153,13738,13767,37.9,15866,14356,14480,fp16 +gptq-batch_size(2)-sequence_length(256)-new_tokens(512),A100,2,0.0706,28.3,6872,4484,4575,66.5,8822,5722,6526,GPTQ +gptq-batch_size(1)-sequence_length(256)-new_tokens(512),A100,1,0.0458,21.8,6746,4298,4450,34.6,7606,4916,5309,GPTQ diff --git a/examples/running-llamas/configs/_base_.yaml b/examples/running-llamas/configs/_base_.yaml index 46c1944a..23a3eaa4 100644 --- a/examples/running-llamas/configs/_base_.yaml +++ b/examples/running-llamas/configs/_base_.yaml @@ -22,7 +22,7 @@ hydra: experiment_name: fp16-batch_size(${benchmark.input_shapes.batch_size})-sequence_length(${benchmark.input_shapes.sequence_length})-new_tokens(${benchmark.new_tokens}) model: meta-llama/Llama-2-7b-hf -device: cuda +device: cuda:0 backend: initial_isolation_check: false diff --git a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(1)-sequence_length(256)-new_tokens(512)/0/.hydra/config.yaml b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(1)-sequence_length(256)-new_tokens(512)/0/.hydra/config.yaml index 731159f0..d2cd4143 100644 --- a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(1)-sequence_length(256)-new_tokens(512)/0/.hydra/config.yaml +++ b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(1)-sequence_length(256)-new_tokens(512)/0/.hydra/config.yaml @@ -45,7 +45,7 @@ benchmark: generate_kwargs: {} experiment_name: fp16-batch_size(${benchmark.input_shapes.batch_size})-sequence_length(${benchmark.input_shapes.sequence_length})-new_tokens(${benchmark.new_tokens}) model: meta-llama/Llama-2-7b-hf -device: cuda +device: cuda:0 task: ${infer_task:${model}} hub_kwargs: revision: main @@ -68,3 +68,6 @@ environment: cpu_ram_mb: 540684 gpus: - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB diff --git a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(1)-sequence_length(256)-new_tokens(512)/0/.hydra/hydra.yaml b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(1)-sequence_length(256)-new_tokens(512)/0/.hydra/hydra.yaml index e118a4ed..41af721d 100644 --- a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(1)-sequence_length(256)-new_tokens(512)/0/.hydra/hydra.yaml +++ b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(1)-sequence_length(256)-new_tokens(512)/0/.hydra/hydra.yaml @@ -130,7 +130,7 @@ hydra: num: 0 config_name: _base_ env_set: - CUDA_VISIBLE_DEVICES: '0' + CUDA_VISIBLE_DEVICES: '3' CUDA_DEVICE_ORDER: PCI_BUS_ID env_copy: [] config: diff --git a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(1)-sequence_length(256)-new_tokens(512)/0/hydra_config.yaml b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(1)-sequence_length(256)-new_tokens(512)/0/hydra_config.yaml index 4f2d08cd..6d8478b5 100644 --- a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(1)-sequence_length(256)-new_tokens(512)/0/hydra_config.yaml +++ b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(1)-sequence_length(256)-new_tokens(512)/0/hydra_config.yaml @@ -74,3 +74,6 @@ environment: cpu_ram_mb: 540684 gpus: - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB diff --git a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(1)-sequence_length(256)-new_tokens(512)/0/inference_results.csv 
b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(1)-sequence_length(256)-new_tokens(512)/0/inference_results.csv index 6d7a78c1..8161550f 100644 --- a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(1)-sequence_length(256)-new_tokens(512)/0/inference_results.csv +++ b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(1)-sequence_length(256)-new_tokens(512)/0/inference_results.csv @@ -1,2 +1,2 @@ forward.latency(s),forward.throughput(samples/s),forward.max_memory_used(MB),forward.max_memory_allocated(MB),forward.max_memory_reserved(MB),generate.latency(s),generate.throughput(tokens/s),generate.max_memory_used(MB),generate.max_memory_allocated(MB),generate.max_memory_reserved(MB) -0.0325,30.8,25840,13906,23542,13.2,38.8,25840,14356,23542 +0.0328,30.5,15153,13738,13767,13.5,37.9,15866,14356,14480 diff --git a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(16)-sequence_length(256)-new_tokens(512)/4/.hydra/config.yaml b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(16)-sequence_length(256)-new_tokens(512)/4/.hydra/config.yaml index ca63235e..88fed34b 100644 --- a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(16)-sequence_length(256)-new_tokens(512)/4/.hydra/config.yaml +++ b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(16)-sequence_length(256)-new_tokens(512)/4/.hydra/config.yaml @@ -45,7 +45,7 @@ benchmark: generate_kwargs: {} experiment_name: fp16-batch_size(${benchmark.input_shapes.batch_size})-sequence_length(${benchmark.input_shapes.sequence_length})-new_tokens(${benchmark.new_tokens}) model: meta-llama/Llama-2-7b-hf -device: cuda +device: cuda:0 task: ${infer_task:${model}} hub_kwargs: revision: main @@ -68,3 +68,6 @@ environment: cpu_ram_mb: 540684 gpus: - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB diff --git a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(16)-sequence_length(256)-new_tokens(512)/4/.hydra/hydra.yaml b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(16)-sequence_length(256)-new_tokens(512)/4/.hydra/hydra.yaml index 657c8eca..1b047280 100644 --- a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(16)-sequence_length(256)-new_tokens(512)/4/.hydra/hydra.yaml +++ b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(16)-sequence_length(256)-new_tokens(512)/4/.hydra/hydra.yaml @@ -130,7 +130,7 @@ hydra: num: 4 config_name: _base_ env_set: - CUDA_VISIBLE_DEVICES: '0' + CUDA_VISIBLE_DEVICES: '3' CUDA_DEVICE_ORDER: PCI_BUS_ID env_copy: [] config: diff --git a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(16)-sequence_length(256)-new_tokens(512)/4/hydra_config.yaml b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(16)-sequence_length(256)-new_tokens(512)/4/hydra_config.yaml index b4f7b466..64882a0b 100644 --- a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(16)-sequence_length(256)-new_tokens(512)/4/hydra_config.yaml +++ b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(16)-sequence_length(256)-new_tokens(512)/4/hydra_config.yaml @@ -51,7 +51,7 @@ benchmark: num_beams: 1 experiment_name: fp16-batch_size(16)-sequence_length(256)-new_tokens(512) model: meta-llama/Llama-2-7b-hf -device: cuda +device: cuda:0 task: text-generation hub_kwargs: revision: main @@ -74,3 +74,6 @@ environment: cpu_ram_mb: 540684 gpus: - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB diff 
--git a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(16)-sequence_length(256)-new_tokens(512)/4/inference_results.csv b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(16)-sequence_length(256)-new_tokens(512)/4/inference_results.csv index 3943e0e2..bad3d7ca 100644 --- a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(16)-sequence_length(256)-new_tokens(512)/4/inference_results.csv +++ b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(16)-sequence_length(256)-new_tokens(512)/4/inference_results.csv @@ -1,2 +1,2 @@ forward.latency(s),forward.throughput(samples/s),forward.max_memory_used(MB),forward.max_memory_allocated(MB),forward.max_memory_reserved(MB),generate.latency(s),generate.throughput(tokens/s),generate.max_memory_used(MB),generate.max_memory_allocated(MB),generate.max_memory_reserved(MB) -0.402,39.8,26889,19191,24591,17.3,474.0,79295,26441,76996 +0.402,39.8,19165,16520,17779,17.4,471.0,27988,26442,84511 diff --git a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(2)-sequence_length(256)-new_tokens(512)/1/.hydra/config.yaml b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(2)-sequence_length(256)-new_tokens(512)/1/.hydra/config.yaml index 7d71e774..f4197a4f 100644 --- a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(2)-sequence_length(256)-new_tokens(512)/1/.hydra/config.yaml +++ b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(2)-sequence_length(256)-new_tokens(512)/1/.hydra/config.yaml @@ -45,7 +45,7 @@ benchmark: generate_kwargs: {} experiment_name: fp16-batch_size(${benchmark.input_shapes.batch_size})-sequence_length(${benchmark.input_shapes.sequence_length})-new_tokens(${benchmark.new_tokens}) model: meta-llama/Llama-2-7b-hf -device: cuda +device: cuda:0 task: ${infer_task:${model}} hub_kwargs: revision: main @@ -68,3 +68,6 @@ environment: cpu_ram_mb: 540684 gpus: - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB diff --git a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(2)-sequence_length(256)-new_tokens(512)/1/.hydra/hydra.yaml b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(2)-sequence_length(256)-new_tokens(512)/1/.hydra/hydra.yaml index 695b1b9b..7ef5862d 100644 --- a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(2)-sequence_length(256)-new_tokens(512)/1/.hydra/hydra.yaml +++ b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(2)-sequence_length(256)-new_tokens(512)/1/.hydra/hydra.yaml @@ -130,7 +130,7 @@ hydra: num: 1 config_name: _base_ env_set: - CUDA_VISIBLE_DEVICES: '0' + CUDA_VISIBLE_DEVICES: '3' CUDA_DEVICE_ORDER: PCI_BUS_ID env_copy: [] config: diff --git a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(2)-sequence_length(256)-new_tokens(512)/1/hydra_config.yaml b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(2)-sequence_length(256)-new_tokens(512)/1/hydra_config.yaml index e20df58e..1de85dc8 100644 --- a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(2)-sequence_length(256)-new_tokens(512)/1/hydra_config.yaml +++ b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(2)-sequence_length(256)-new_tokens(512)/1/hydra_config.yaml @@ -74,3 +74,6 @@ environment: cpu_ram_mb: 540684 gpus: - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB diff --git 
a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(2)-sequence_length(256)-new_tokens(512)/1/inference_results.csv b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(2)-sequence_length(256)-new_tokens(512)/1/inference_results.csv index 430e8ab8..42a8985a 100644 --- a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(2)-sequence_length(256)-new_tokens(512)/1/inference_results.csv +++ b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(2)-sequence_length(256)-new_tokens(512)/1/inference_results.csv @@ -1,2 +1,2 @@ forward.latency(s),forward.throughput(samples/s),forward.max_memory_used(MB),forward.max_memory_allocated(MB),forward.max_memory_reserved(MB),generate.latency(s),generate.throughput(tokens/s),generate.max_memory_used(MB),generate.max_memory_allocated(MB),generate.max_memory_reserved(MB) -0.0574,34.8,25840,14258,23542,13.7,74.7,25840,15160,23542 +0.0579,34.5,15392,13924,14006,13.6,75.3,17003,15162,15617 diff --git a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(2)-sequence_length(256)-new_tokens(512)/multirun.yaml b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(2)-sequence_length(256)-new_tokens(512)/multirun.yaml index 2b9217a2..6de5aaf7 100644 --- a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(2)-sequence_length(256)-new_tokens(512)/multirun.yaml +++ b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(2)-sequence_length(256)-new_tokens(512)/multirun.yaml @@ -129,7 +129,7 @@ hydra: num: ??? config_name: _base_ env_set: - CUDA_VISIBLE_DEVICES: '0' + CUDA_VISIBLE_DEVICES: '3' CUDA_DEVICE_ORDER: PCI_BUS_ID env_copy: [] config: @@ -218,7 +218,7 @@ benchmark: generate_kwargs: {} experiment_name: fp16-batch_size(${benchmark.input_shapes.batch_size})-sequence_length(${benchmark.input_shapes.sequence_length})-new_tokens(${benchmark.new_tokens}) model: meta-llama/Llama-2-7b-hf -device: cuda +device: cuda:0 task: ${infer_task:${model}} hub_kwargs: revision: main @@ -241,3 +241,6 @@ environment: cpu_ram_mb: 540684 gpus: - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB diff --git a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(4)-sequence_length(256)-new_tokens(512)/2/.hydra/config.yaml b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(4)-sequence_length(256)-new_tokens(512)/2/.hydra/config.yaml index b5f39467..589d9f13 100644 --- a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(4)-sequence_length(256)-new_tokens(512)/2/.hydra/config.yaml +++ b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(4)-sequence_length(256)-new_tokens(512)/2/.hydra/config.yaml @@ -45,7 +45,7 @@ benchmark: generate_kwargs: {} experiment_name: fp16-batch_size(${benchmark.input_shapes.batch_size})-sequence_length(${benchmark.input_shapes.sequence_length})-new_tokens(${benchmark.new_tokens}) model: meta-llama/Llama-2-7b-hf -device: cuda +device: cuda:0 task: ${infer_task:${model}} hub_kwargs: revision: main @@ -68,3 +68,6 @@ environment: cpu_ram_mb: 540684 gpus: - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB diff --git a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(4)-sequence_length(256)-new_tokens(512)/2/.hydra/hydra.yaml b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(4)-sequence_length(256)-new_tokens(512)/2/.hydra/hydra.yaml index 7f492517..8e3abfc7 100644 --- 
a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(4)-sequence_length(256)-new_tokens(512)/2/.hydra/hydra.yaml +++ b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(4)-sequence_length(256)-new_tokens(512)/2/.hydra/hydra.yaml @@ -130,7 +130,7 @@ hydra: num: 2 config_name: _base_ env_set: - CUDA_VISIBLE_DEVICES: '0' + CUDA_VISIBLE_DEVICES: '3' CUDA_DEVICE_ORDER: PCI_BUS_ID env_copy: [] config: diff --git a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(4)-sequence_length(256)-new_tokens(512)/2/hydra_config.yaml b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(4)-sequence_length(256)-new_tokens(512)/2/hydra_config.yaml index 7d0d276a..eb07905d 100644 --- a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(4)-sequence_length(256)-new_tokens(512)/2/hydra_config.yaml +++ b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(4)-sequence_length(256)-new_tokens(512)/2/hydra_config.yaml @@ -74,3 +74,6 @@ environment: cpu_ram_mb: 540684 gpus: - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB diff --git a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(4)-sequence_length(256)-new_tokens(512)/2/inference_results.csv b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(4)-sequence_length(256)-new_tokens(512)/2/inference_results.csv index 5e3f025f..3a40d093 100644 --- a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(4)-sequence_length(256)-new_tokens(512)/2/inference_results.csv +++ b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(4)-sequence_length(256)-new_tokens(512)/2/inference_results.csv @@ -1,2 +1,2 @@ forward.latency(s),forward.throughput(samples/s),forward.max_memory_used(MB),forward.max_memory_allocated(MB),forward.max_memory_reserved(MB),generate.latency(s),generate.throughput(tokens/s),generate.max_memory_used(MB),generate.max_memory_allocated(MB),generate.max_memory_reserved(MB) -0.106,37.7,25843,14962,23544,13.7,149.0,25843,16768,23544 +0.107,37.4,16022,14295,14636,13.9,147.0,26346,16774,24960 diff --git a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(8)-sequence_length(256)-new_tokens(512)/3/.hydra/config.yaml b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(8)-sequence_length(256)-new_tokens(512)/3/.hydra/config.yaml index e9d5b0c0..ff7402ca 100644 --- a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(8)-sequence_length(256)-new_tokens(512)/3/.hydra/config.yaml +++ b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(8)-sequence_length(256)-new_tokens(512)/3/.hydra/config.yaml @@ -45,7 +45,7 @@ benchmark: generate_kwargs: {} experiment_name: fp16-batch_size(${benchmark.input_shapes.batch_size})-sequence_length(${benchmark.input_shapes.sequence_length})-new_tokens(${benchmark.new_tokens}) model: meta-llama/Llama-2-7b-hf -device: cuda +device: cuda:0 task: ${infer_task:${model}} hub_kwargs: revision: main @@ -68,3 +68,6 @@ environment: cpu_ram_mb: 540684 gpus: - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB diff --git a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(8)-sequence_length(256)-new_tokens(512)/3/.hydra/hydra.yaml b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(8)-sequence_length(256)-new_tokens(512)/3/.hydra/hydra.yaml index cf6bb64f..ed654ac1 100644 --- 
a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(8)-sequence_length(256)-new_tokens(512)/3/.hydra/hydra.yaml +++ b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(8)-sequence_length(256)-new_tokens(512)/3/.hydra/hydra.yaml @@ -130,7 +130,7 @@ hydra: num: 3 config_name: _base_ env_set: - CUDA_VISIBLE_DEVICES: '0' + CUDA_VISIBLE_DEVICES: '3' CUDA_DEVICE_ORDER: PCI_BUS_ID env_copy: [] config: diff --git a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(8)-sequence_length(256)-new_tokens(512)/3/hydra_config.yaml b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(8)-sequence_length(256)-new_tokens(512)/3/hydra_config.yaml index fe52203f..a180796d 100644 --- a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(8)-sequence_length(256)-new_tokens(512)/3/hydra_config.yaml +++ b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(8)-sequence_length(256)-new_tokens(512)/3/hydra_config.yaml @@ -74,3 +74,6 @@ environment: cpu_ram_mb: 540684 gpus: - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB diff --git a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(8)-sequence_length(256)-new_tokens(512)/3/inference_results.csv b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(8)-sequence_length(256)-new_tokens(512)/3/inference_results.csv index f51d29e0..7fdc7a5f 100644 --- a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(8)-sequence_length(256)-new_tokens(512)/3/inference_results.csv +++ b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(8)-sequence_length(256)-new_tokens(512)/3/inference_results.csv @@ -1,2 +1,2 @@ forward.latency(s),forward.throughput(samples/s),forward.max_memory_used(MB),forward.max_memory_allocated(MB),forward.max_memory_reserved(MB),generate.latency(s),generate.throughput(tokens/s),generate.max_memory_used(MB),generate.max_memory_allocated(MB),generate.max_memory_reserved(MB) -0.202,39.6,26369,16372,24071,14.0,293.0,32302,19990,30003 +0.204,39.2,17087,15037,15701,14.1,290.0,64889,19997,63503 diff --git a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(1)-sequence_length(256)-new_tokens(512)/0/.hydra/config.yaml b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(1)-sequence_length(256)-new_tokens(512)/0/.hydra/config.yaml index 0d9b94ce..03090426 100644 --- a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(1)-sequence_length(256)-new_tokens(512)/0/.hydra/config.yaml +++ b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(1)-sequence_length(256)-new_tokens(512)/0/.hydra/config.yaml @@ -45,7 +45,7 @@ benchmark: generate_kwargs: {} experiment_name: gptq-batch_size(${benchmark.input_shapes.batch_size})-sequence_length(${benchmark.input_shapes.sequence_length})-new_tokens(${benchmark.new_tokens}) model: TheBloke/Llama-2-7B-GPTQ -device: cuda +device: cuda:0 task: ${infer_task:${model}} hub_kwargs: revision: main @@ -68,3 +68,6 @@ environment: cpu_ram_mb: 540684 gpus: - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB diff --git a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(1)-sequence_length(256)-new_tokens(512)/0/.hydra/hydra.yaml b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(1)-sequence_length(256)-new_tokens(512)/0/.hydra/hydra.yaml index ad31c5c9..65926c1f 100644 --- 
a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(1)-sequence_length(256)-new_tokens(512)/0/.hydra/hydra.yaml +++ b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(1)-sequence_length(256)-new_tokens(512)/0/.hydra/hydra.yaml @@ -130,7 +130,7 @@ hydra: num: 0 config_name: gptq env_set: - CUDA_VISIBLE_DEVICES: '0' + CUDA_VISIBLE_DEVICES: '3' CUDA_DEVICE_ORDER: PCI_BUS_ID env_copy: [] config: diff --git a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(1)-sequence_length(256)-new_tokens(512)/0/hydra_config.yaml b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(1)-sequence_length(256)-new_tokens(512)/0/hydra_config.yaml index cc0d39b8..256e0e54 100644 --- a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(1)-sequence_length(256)-new_tokens(512)/0/hydra_config.yaml +++ b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(1)-sequence_length(256)-new_tokens(512)/0/hydra_config.yaml @@ -74,3 +74,6 @@ environment: cpu_ram_mb: 540684 gpus: - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB diff --git a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(1)-sequence_length(256)-new_tokens(512)/0/inference_results.csv b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(1)-sequence_length(256)-new_tokens(512)/0/inference_results.csv index bb298e56..da04a0c3 100644 --- a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(1)-sequence_length(256)-new_tokens(512)/0/inference_results.csv +++ b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(1)-sequence_length(256)-new_tokens(512)/0/inference_results.csv @@ -1,2 +1,2 @@ forward.latency(s),forward.throughput(samples/s),forward.max_memory_used(MB),forward.max_memory_allocated(MB),forward.max_memory_reserved(MB),generate.latency(s),generate.throughput(tokens/s),generate.max_memory_used(MB),generate.max_memory_allocated(MB),generate.max_memory_reserved(MB) -0.0457,21.9,6803,4467,4504,14.6,35.1,7599,4916,5301 +0.0458,21.8,6746,4298,4450,14.8,34.6,7606,4916,5309 diff --git a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(16)-sequence_length(256)-new_tokens(512)/4/.hydra/config.yaml b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(16)-sequence_length(256)-new_tokens(512)/4/.hydra/config.yaml index 8447790b..58457ece 100644 --- a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(16)-sequence_length(256)-new_tokens(512)/4/.hydra/config.yaml +++ b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(16)-sequence_length(256)-new_tokens(512)/4/.hydra/config.yaml @@ -45,7 +45,7 @@ benchmark: generate_kwargs: {} experiment_name: gptq-batch_size(${benchmark.input_shapes.batch_size})-sequence_length(${benchmark.input_shapes.sequence_length})-new_tokens(${benchmark.new_tokens}) model: TheBloke/Llama-2-7B-GPTQ -device: cuda +device: cuda:0 task: ${infer_task:${model}} hub_kwargs: revision: main @@ -68,3 +68,6 @@ environment: cpu_ram_mb: 540684 gpus: - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB diff --git a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(16)-sequence_length(256)-new_tokens(512)/4/.hydra/hydra.yaml b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(16)-sequence_length(256)-new_tokens(512)/4/.hydra/hydra.yaml index b4c13c3d..c9e56066 100644 --- 
a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(16)-sequence_length(256)-new_tokens(512)/4/.hydra/hydra.yaml +++ b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(16)-sequence_length(256)-new_tokens(512)/4/.hydra/hydra.yaml @@ -130,7 +130,7 @@ hydra: num: 4 config_name: gptq env_set: - CUDA_VISIBLE_DEVICES: '0' + CUDA_VISIBLE_DEVICES: '3' CUDA_DEVICE_ORDER: PCI_BUS_ID env_copy: [] config: diff --git a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(16)-sequence_length(256)-new_tokens(512)/4/hydra_config.yaml b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(16)-sequence_length(256)-new_tokens(512)/4/hydra_config.yaml index b91d6e1f..ad91270b 100644 --- a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(16)-sequence_length(256)-new_tokens(512)/4/hydra_config.yaml +++ b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(16)-sequence_length(256)-new_tokens(512)/4/hydra_config.yaml @@ -51,7 +51,7 @@ benchmark: num_beams: 1 experiment_name: gptq-batch_size(16)-sequence_length(256)-new_tokens(512) model: TheBloke/Llama-2-7B-GPTQ -device: cuda +device: cuda:0 task: text-generation hub_kwargs: revision: main @@ -74,3 +74,6 @@ environment: cpu_ram_mb: 540684 gpus: - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB diff --git a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(16)-sequence_length(256)-new_tokens(512)/4/inference_results.csv b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(16)-sequence_length(256)-new_tokens(512)/4/inference_results.csv index d94b1e51..0b114e75 100644 --- a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(16)-sequence_length(256)-new_tokens(512)/4/inference_results.csv +++ b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(16)-sequence_length(256)-new_tokens(512)/4/inference_results.csv @@ -1,2 +1,2 @@ forward.latency(s),forward.throughput(samples/s),forward.max_memory_used(MB),forward.max_memory_allocated(MB),forward.max_memory_reserved(MB),generate.latency(s),generate.throughput(tokens/s),generate.max_memory_used(MB),generate.max_memory_allocated(MB),generate.max_memory_reserved(MB) -0.415,38.6,13652,9752,11353,24.6,333.0,83302,17001,81004 +0.415,38.6,10900,7080,8604,24.6,333.0,65676,17002,83596 diff --git a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(2)-sequence_length(256)-new_tokens(512)/1/.hydra/config.yaml b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(2)-sequence_length(256)-new_tokens(512)/1/.hydra/config.yaml index f3a254f2..90ddbb6e 100644 --- a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(2)-sequence_length(256)-new_tokens(512)/1/.hydra/config.yaml +++ b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(2)-sequence_length(256)-new_tokens(512)/1/.hydra/config.yaml @@ -45,7 +45,7 @@ benchmark: generate_kwargs: {} experiment_name: gptq-batch_size(${benchmark.input_shapes.batch_size})-sequence_length(${benchmark.input_shapes.sequence_length})-new_tokens(${benchmark.new_tokens}) model: TheBloke/Llama-2-7B-GPTQ -device: cuda +device: cuda:0 task: ${infer_task:${model}} hub_kwargs: revision: main @@ -68,3 +68,6 @@ environment: cpu_ram_mb: 540684 gpus: - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB diff --git a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(2)-sequence_length(256)-new_tokens(512)/1/.hydra/hydra.yaml 
b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(2)-sequence_length(256)-new_tokens(512)/1/.hydra/hydra.yaml index b8de14e2..77d220f4 100644 --- a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(2)-sequence_length(256)-new_tokens(512)/1/.hydra/hydra.yaml +++ b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(2)-sequence_length(256)-new_tokens(512)/1/.hydra/hydra.yaml @@ -130,7 +130,7 @@ hydra: num: 1 config_name: gptq env_set: - CUDA_VISIBLE_DEVICES: '0' + CUDA_VISIBLE_DEVICES: '3' CUDA_DEVICE_ORDER: PCI_BUS_ID env_copy: [] config: diff --git a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(2)-sequence_length(256)-new_tokens(512)/1/hydra_config.yaml b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(2)-sequence_length(256)-new_tokens(512)/1/hydra_config.yaml index 7aa2219c..d946b75c 100644 --- a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(2)-sequence_length(256)-new_tokens(512)/1/hydra_config.yaml +++ b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(2)-sequence_length(256)-new_tokens(512)/1/hydra_config.yaml @@ -74,3 +74,6 @@ environment: cpu_ram_mb: 540684 gpus: - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB diff --git a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(2)-sequence_length(256)-new_tokens(512)/1/inference_results.csv b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(2)-sequence_length(256)-new_tokens(512)/1/inference_results.csv index 1c82ad12..1797a299 100644 --- a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(2)-sequence_length(256)-new_tokens(512)/1/inference_results.csv +++ b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(2)-sequence_length(256)-new_tokens(512)/1/inference_results.csv @@ -1,2 +1,2 @@ forward.latency(s),forward.throughput(samples/s),forward.max_memory_used(MB),forward.max_memory_allocated(MB),forward.max_memory_reserved(MB),generate.latency(s),generate.throughput(tokens/s),generate.max_memory_used(MB),generate.max_memory_allocated(MB),generate.max_memory_reserved(MB) -0.0703,28.4,7167,4818,4869,15.0,68.3,8728,5722,6429 +0.0706,28.3,6872,4484,4575,15.4,66.5,8822,5722,6526 diff --git a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(2)-sequence_length(256)-new_tokens(512)/multirun.yaml b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(2)-sequence_length(256)-new_tokens(512)/multirun.yaml index 68a3b9cd..667d738e 100644 --- a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(2)-sequence_length(256)-new_tokens(512)/multirun.yaml +++ b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(2)-sequence_length(256)-new_tokens(512)/multirun.yaml @@ -129,7 +129,7 @@ hydra: num: ??? 
config_name: gptq env_set: - CUDA_VISIBLE_DEVICES: '0' + CUDA_VISIBLE_DEVICES: '3' CUDA_DEVICE_ORDER: PCI_BUS_ID env_copy: [] config: @@ -218,7 +218,7 @@ benchmark: generate_kwargs: {} experiment_name: gptq-batch_size(${benchmark.input_shapes.batch_size})-sequence_length(${benchmark.input_shapes.sequence_length})-new_tokens(${benchmark.new_tokens}) model: TheBloke/Llama-2-7B-GPTQ -device: cuda +device: cuda:0 task: ${infer_task:${model}} hub_kwargs: revision: main @@ -241,3 +241,6 @@ environment: cpu_ram_mb: 540684 gpus: - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB diff --git a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(4)-sequence_length(256)-new_tokens(512)/2/.hydra/config.yaml b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(4)-sequence_length(256)-new_tokens(512)/2/.hydra/config.yaml index 5824d005..97f81007 100644 --- a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(4)-sequence_length(256)-new_tokens(512)/2/.hydra/config.yaml +++ b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(4)-sequence_length(256)-new_tokens(512)/2/.hydra/config.yaml @@ -45,7 +45,7 @@ benchmark: generate_kwargs: {} experiment_name: gptq-batch_size(${benchmark.input_shapes.batch_size})-sequence_length(${benchmark.input_shapes.sequence_length})-new_tokens(${benchmark.new_tokens}) model: TheBloke/Llama-2-7B-GPTQ -device: cuda +device: cuda:0 task: ${infer_task:${model}} hub_kwargs: revision: main @@ -68,3 +68,6 @@ environment: cpu_ram_mb: 540684 gpus: - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB diff --git a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(4)-sequence_length(256)-new_tokens(512)/2/.hydra/hydra.yaml b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(4)-sequence_length(256)-new_tokens(512)/2/.hydra/hydra.yaml index 83443ee3..d5efb914 100644 --- a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(4)-sequence_length(256)-new_tokens(512)/2/.hydra/hydra.yaml +++ b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(4)-sequence_length(256)-new_tokens(512)/2/.hydra/hydra.yaml @@ -130,7 +130,7 @@ hydra: num: 2 config_name: gptq env_set: - CUDA_VISIBLE_DEVICES: '0' + CUDA_VISIBLE_DEVICES: '3' CUDA_DEVICE_ORDER: PCI_BUS_ID env_copy: [] config: diff --git a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(4)-sequence_length(256)-new_tokens(512)/2/hydra_config.yaml b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(4)-sequence_length(256)-new_tokens(512)/2/hydra_config.yaml index e51408dd..eb2c9cbd 100644 --- a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(4)-sequence_length(256)-new_tokens(512)/2/hydra_config.yaml +++ b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(4)-sequence_length(256)-new_tokens(512)/2/hydra_config.yaml @@ -74,3 +74,6 @@ environment: cpu_ram_mb: 540684 gpus: - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB diff --git a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(4)-sequence_length(256)-new_tokens(512)/2/inference_results.csv b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(4)-sequence_length(256)-new_tokens(512)/2/inference_results.csv index 99188f0c..e54a036c 100644 --- a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(4)-sequence_length(256)-new_tokens(512)/2/inference_results.csv +++ 
b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(4)-sequence_length(256)-new_tokens(512)/2/inference_results.csv @@ -1,2 +1,2 @@ forward.latency(s),forward.throughput(samples/s),forward.max_memory_used(MB),forward.max_memory_allocated(MB),forward.max_memory_reserved(MB),generate.latency(s),generate.throughput(tokens/s),generate.max_memory_used(MB),generate.max_memory_allocated(MB),generate.max_memory_reserved(MB) -0.121,33.1,8541,5524,6243,15.2,135.0,16399,7334,14101 +0.122,32.8,7761,4855,5465,15.3,134.0,18085,7335,15789 diff --git a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(8)-sequence_length(256)-new_tokens(512)/3/.hydra/config.yaml b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(8)-sequence_length(256)-new_tokens(512)/3/.hydra/config.yaml index 63781dea..3a4085f7 100644 --- a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(8)-sequence_length(256)-new_tokens(512)/3/.hydra/config.yaml +++ b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(8)-sequence_length(256)-new_tokens(512)/3/.hydra/config.yaml @@ -45,7 +45,7 @@ benchmark: generate_kwargs: {} experiment_name: gptq-batch_size(${benchmark.input_shapes.batch_size})-sequence_length(${benchmark.input_shapes.sequence_length})-new_tokens(${benchmark.new_tokens}) model: TheBloke/Llama-2-7B-GPTQ -device: cuda +device: cuda:0 task: ${infer_task:${model}} hub_kwargs: revision: main @@ -68,3 +68,6 @@ environment: cpu_ram_mb: 540684 gpus: - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB diff --git a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(8)-sequence_length(256)-new_tokens(512)/3/.hydra/hydra.yaml b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(8)-sequence_length(256)-new_tokens(512)/3/.hydra/hydra.yaml index ac702a9f..5f8fa193 100644 --- a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(8)-sequence_length(256)-new_tokens(512)/3/.hydra/hydra.yaml +++ b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(8)-sequence_length(256)-new_tokens(512)/3/.hydra/hydra.yaml @@ -130,7 +130,7 @@ hydra: num: 3 config_name: gptq env_set: - CUDA_VISIBLE_DEVICES: '0' + CUDA_VISIBLE_DEVICES: '3' CUDA_DEVICE_ORDER: PCI_BUS_ID env_copy: [] config: diff --git a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(8)-sequence_length(256)-new_tokens(512)/3/hydra_config.yaml b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(8)-sequence_length(256)-new_tokens(512)/3/hydra_config.yaml index 0d6a5583..06e20479 100644 --- a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(8)-sequence_length(256)-new_tokens(512)/3/hydra_config.yaml +++ b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(8)-sequence_length(256)-new_tokens(512)/3/hydra_config.yaml @@ -74,3 +74,6 @@ environment: cpu_ram_mb: 540684 gpus: - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB + - NVIDIA A100-SXM4-80GB diff --git a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(8)-sequence_length(256)-new_tokens(512)/3/inference_results.csv b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(8)-sequence_length(256)-new_tokens(512)/3/inference_results.csv index 202fed5e..5c52e357 100644 --- a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(8)-sequence_length(256)-new_tokens(512)/3/inference_results.csv +++ 
b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(8)-sequence_length(256)-new_tokens(512)/3/inference_results.csv @@ -1,2 +1,2 @@ forward.latency(s),forward.throughput(samples/s),forward.max_memory_used(MB),forward.max_memory_allocated(MB),forward.max_memory_reserved(MB),generate.latency(s),generate.throughput(tokens/s),generate.max_memory_used(MB),generate.max_memory_allocated(MB),generate.max_memory_reserved(MB) -0.22,36.4,10252,6933,7954,19.8,207.0,44107,10556,41808 +0.223,35.9,8826,5597,6530,19.9,206.0,56629,10557,54333 diff --git a/examples/running-llamas/report.py b/examples/running-llamas/report.py index bd309584..bcf74191 100644 --- a/examples/running-llamas/report.py +++ b/examples/running-llamas/report.py @@ -101,6 +101,8 @@ def get_short_report(inference_report): short_report["GPU"].replace("AMD INSTINCT MI250 (MCM) OAM AC MBA", "MI250", inplace=True) short_report["GPU"].replace("NVIDIA A100-SXM4-80GB", "A100", inplace=True) + short_report["Group"] = short_report["GPU"] + "-" + short_report["Quantization Scheme"] + return short_report @@ -126,10 +128,8 @@ def get_throughput_plot(short_report): fig3, ax3 = plt.subplots() fig4, ax4 = plt.subplots() - short_report["Quantization Scheme"] = short_report["GPU"] + "-" + short_report["Quantization Scheme"] - - for quantization_scheme in short_report["Quantization Scheme"].unique(): - mask = short_report["Quantization Scheme"] == quantization_scheme + for group in short_report["Group"].unique(): + mask = short_report["Group"] == group forward_latency = short_report[mask][["Batch Size", "Forward Latency (s)"]].sort_values(by="Batch Size") generate_throughput = short_report[mask][["Batch Size", "Generate Throughput (tokens/s)"]].sort_values( @@ -157,51 +157,51 @@ def get_throughput_plot(short_report): ax1.plot( forward_latency["Batch Size"], forward_latency["Forward Latency (s)"], - label=quantization_scheme, + label=group, marker="o", ) ax2.plot( generate_throughput["Batch Size"], generate_throughput["Generate Throughput (tokens/s)"], - label=quantization_scheme, + label=group, marker="o", ) ax3.plot( forward_memory["Batch Size"], forward_memory["Forward Max Memory Used (MB)"], - label=quantization_scheme + "-used", - marker="*", + label=group + "-used", + marker="^", + ) + ax3.plot( + forward_pytorch_max_memory_reserved["Batch Size"], + forward_pytorch_max_memory_reserved["Forward Max Memory Reserved (MB)"], + label=group + "-reserved", + marker=".", ) ax3.plot( forward_pytorch_max_memory_allocated["Batch Size"], forward_pytorch_max_memory_allocated["Forward Max Memory Allocated (MB)"], - label=quantization_scheme + "-allocated", + label=group + "-allocated", marker="v", ) - ax3.plot( - forward_pytorch_max_memory_reserved["Batch Size"], - forward_pytorch_max_memory_reserved["Forward Max Memory Reserved (MB)"], - label=quantization_scheme + "-reserved", - marker="^", - ) ax4.plot( generate_memory["Batch Size"], generate_memory["Generate Max Memory Used (MB)"], - label=quantization_scheme + "-used", - marker="*", + label=group + "-used", + marker="^", + ) + ax4.plot( + generate_pytorch_max_memory_reserved["Batch Size"], + generate_pytorch_max_memory_reserved["Generate Max Memory Reserved (MB)"], + label=group + "-reserved", + marker=".", ) ax4.plot( generate_pytorch_max_memory_allocated["Batch Size"], generate_pytorch_max_memory_allocated["Generate Max Memory Allocated (MB)"], - label=quantization_scheme + "-allocated", + label=group + "-allocated", marker="v", ) - ax4.plot( - generate_pytorch_max_memory_reserved["Batch 
Size"], - generate_pytorch_max_memory_reserved["Generate Max Memory Reserved (MB)"], - label=quantization_scheme + "-reserved", - marker="^", - ) ax1.set_xlabel("Batch Size") ax1.set_ylabel("Forward Latency (s)") diff --git a/optimum_benchmark/benchmarks/inference/benchmark.py b/optimum_benchmark/benchmarks/inference/benchmark.py index 1e978a89..bdee7563 100644 --- a/optimum_benchmark/benchmarks/inference/benchmark.py +++ b/optimum_benchmark/benchmarks/inference/benchmark.py @@ -43,28 +43,42 @@ def configure(self, config: "InferenceConfig"): def run(self, backend: "Backend") -> None: LOGGER.info("Running inference benchmark") + + LOGGER.info("\t+ Updating input shapes with model shapes") self.config.input_shapes.update(backend.model_shapes) + LOGGER.info("\t+ Creating input generator") self.input_generator = InputGenerator( task=backend.task, pretrained_config=backend.pretrained_config, input_shapes=self.config.input_shapes, ) - # openvino requires compiling with static shapes and trt ep requires max tokens + # compile with static shapes if needed + LOGGER.info("\t+ Preparing backend for inference") backend.prepare_for_inference( - input_shapes=self.config.input_shapes, - max_new_tokens=self.config.generate_kwargs.get("max_new_tokens", 0), + input_shapes=self.config.input_shapes, new_tokens=self.config.generate_kwargs["min_new_tokens"] ) - # run forward pass tracking - self.run_forward_tracking(backend) + # run memory tracking + # we do this first to measure the memory on the first call to forward/generate + if self.config.memory: + self.run_forward_memory_tracking(backend) + if self.config.can_generate: + self.run_generate_memory_tracking(backend) + # run lacency tracking + self.run_forward_latency_tracking(backend) if self.config.can_generate: - # if possible, run generation pass tracking - self.run_generate_tracking(backend) + self.run_generate_latency_tracking(backend) - def run_forward_tracking(self, backend: "Backend") -> None: + # run energy tracking + if self.config.energy: + self.run_forward_energy_tracking(backend) + if self.config.can_generate: + self.run_generate_energy_tracking(backend) + + def run_forward_latency_tracking(self, backend: "Backend") -> None: forward_input = self.input_generator.generate(mode="forward") LOGGER.info("\t+ Preparing input for the forward pass") @@ -80,42 +94,52 @@ def run_forward_tracking(self, backend: "Backend") -> None: with latency_tracker.track(): _ = backend.forward(forward_input, self.config.forward_kwargs) self.forward_latencies = latency_tracker.get_latencies() + LOGGER.info(f"\t+ Forward pass latency: {self.forward_latency:.2e} (s)") LOGGER.info(f"\t+ Forward pass throughput: {self.forward_throughput:.2f} (samples/s)") - if self.config.memory: - LOGGER.info("\t+ Tracking forward pass peak memory") - memory_tracker = MemoryTracker(device=backend.device) - with memory_tracker.track(interval=self.forward_latency / 10): + def run_forward_energy_tracking(self, backend: "Backend") -> None: + forward_input = self.input_generator.generate(mode="forward") + + LOGGER.info("\t+ Preparing input for the forward pass") + forward_input = backend.prepare_input(forward_input) + + LOGGER.info("\t+ Tracking forward pass energy consumption") + num_forward_passes = 0 + energy_tracker = EnergyTracker() + with energy_tracker.track(interval=1, file_prefix="forward"): + while energy_tracker.get_elapsed_time() < self.config.duration: _ = backend.forward(forward_input, self.config.forward_kwargs) - self.forward_max_memory_used = memory_tracker.get_max_memory_used() 
- self.forward_max_memory_allocated = memory_tracker.get_max_memory_allocated() - self.forward_max_memory_reserved = memory_tracker.get_max_memory_reserved() - LOGGER.info(f"\t+ Forward pass max memory used: {self.forward_max_memory_used} (MB)") - LOGGER.info(f"\t+ Forward pass max memory allocated: {self.forward_max_memory_allocated} (MB)") - LOGGER.info(f"\t+ Forward pass max memory reserved: {self.forward_max_memory_reserved} (MB)") + num_forward_passes += 1 + num_forward_samples = num_forward_passes * self.config.input_shapes["batch_size"] + self.forward_energy = extract_three_significant_digits(energy_tracker.get_total_energy() / num_forward_samples) + self.forward_emissions = extract_three_significant_digits( + energy_tracker.get_total_emissions() / num_forward_samples + ) - if self.config.energy: - LOGGER.info("\t+ Tracking forward pass energy consumption") - num_forward_passes = 0 - energy_tracker = EnergyTracker() - with energy_tracker.track(interval=1, file_prefix="forward"): - while energy_tracker.get_elapsed_time() < self.config.duration: - _ = backend.forward(forward_input, self.config.forward_kwargs) - num_forward_passes += 1 - - num_forward_samples = num_forward_passes * self.config.input_shapes["batch_size"] - self.forward_energy = extract_three_significant_digits( - energy_tracker.get_total_energy() / num_forward_samples - ) - self.forward_emissions = extract_three_significant_digits( - energy_tracker.get_total_emissions() / num_forward_samples - ) - LOGGER.info(f"\t+ Forward pass energy consumption: {self.forward_energy} (kWh/sample)") - LOGGER.info(f"\t+ Forward pass carbon emissions: {self.forward_emissions} (kgCO2eq/sample)") - LOGGER.info(f"\t+ Full details in the CodeCarbon report: {os.getcwd()}/forward_codecarbon.csv") - - def run_generate_tracking(self, backend: "Backend") -> None: + LOGGER.info(f"\t+ Forward pass energy consumption: {self.forward_energy} (kWh/sample)") + LOGGER.info(f"\t+ Forward pass carbon emissions: {self.forward_emissions} (kgCO2eq/sample)") + LOGGER.info(f"\t+ Full details in the CodeCarbon report: {os.getcwd()}/forward_codecarbon.csv") + + def run_forward_memory_tracking(self, backend: "Backend") -> None: + forward_input = self.input_generator.generate(mode="forward") + + LOGGER.info("\t+ Preparing input for the forward pass") + forward_input = backend.prepare_input(forward_input) + + LOGGER.info("\t+ Tracking forward pass peak memory") + memory_tracker = MemoryTracker(device=backend.device) + with memory_tracker.track(): + _ = backend.forward(forward_input, self.config.forward_kwargs) + self.forward_max_memory_used = memory_tracker.get_max_memory_used() + self.forward_max_memory_reserved = memory_tracker.get_max_memory_reserved() + self.forward_max_memory_allocated = memory_tracker.get_max_memory_allocated() + + LOGGER.info(f"\t+ Forward pass max memory used: {self.forward_max_memory_used} (MB)") + LOGGER.info(f"\t+ Forward pass max memory reserved: {self.forward_max_memory_reserved} (MB)") + LOGGER.info(f"\t+ Forward pass max memory allocated: {self.forward_max_memory_allocated} (MB)") + + def run_generate_latency_tracking(self, backend: "Backend") -> None: generate_input = self.input_generator.generate(mode="generate") LOGGER.info("\t+ Preparing input for the generation pass") @@ -130,44 +154,56 @@ def run_generate_tracking(self, backend: "Backend") -> None: with latency_tracker.track(): _ = backend.generate(generate_input, self.config.generate_kwargs) self.generate_latencies = latency_tracker.get_latencies() + LOGGER.info(f"\t+ 
Generation pass latency: {self.generate_latency:.2e} (s)") LOGGER.info(f"\t+ Generation pass throughput: {self.generate_throughput:.2f} (tokens/s)") - if self.config.memory: - LOGGER.info("\t+ Tracking generation pass peak memory") - memory_tracker = MemoryTracker(device=backend.device) - with memory_tracker.track(interval=self.generate_latency / 10): + def run_generate_energy_tracking(self, backend: "Backend") -> None: + generate_input = self.input_generator.generate(mode="generate") + + LOGGER.info("\t+ Preparing input for the generation pass") + generate_input = backend.prepare_input(generate_input) + + LOGGER.info("\t+ Tracking generation pass energy consumption") + num_generate_passes = 0 + energy_tracker = EnergyTracker() + with energy_tracker.track(interval=1, file_prefix="generate"): + while energy_tracker.get_elapsed_time() < self.config.duration: _ = backend.generate(generate_input, self.config.generate_kwargs) - self.generate_max_memory_used = memory_tracker.get_max_memory_used() - self.generate_max_memory_allocated = memory_tracker.get_max_memory_allocated() - self.generate_max_memory_reserved = memory_tracker.get_max_memory_reserved() - LOGGER.info(f"\t+ Generation pass max memory used: {self.generate_max_memory_used} (MB)") - LOGGER.info(f"\t+ Generation pass max memory allocated: {self.generate_max_memory_allocated} (MB)") - LOGGER.info(f"\t+ Generation pass max memory reserved: {self.generate_max_memory_reserved} (MB)") + num_generate_passes += 1 + num_generated_tokens = ( + num_generate_passes + * self.config.generate_kwargs["min_new_tokens"] + * self.config.input_shapes["batch_size"] + ) + self.generate_energy = extract_three_significant_digits( + energy_tracker.get_total_energy() / num_generated_tokens + ) + self.generate_emissions = extract_three_significant_digits( + energy_tracker.get_total_emissions() / num_generated_tokens + ) - if self.config.energy: - LOGGER.info("\t+ Tracking generation pass energy consumption") - num_generate_passes = 0 - energy_tracker = EnergyTracker() - with energy_tracker.track(interval=1, file_prefix="generate"): - while energy_tracker.get_elapsed_time() < self.config.duration: - _ = backend.generate(generate_input, self.config.generate_kwargs) - num_generate_passes += 1 - - num_generated_tokens = ( - num_generate_passes - * self.config.generate_kwargs["min_new_tokens"] - * self.config.input_shapes["batch_size"] - ) - self.generate_energy = extract_three_significant_digits( - energy_tracker.get_total_energy() / num_generated_tokens - ) - self.generate_emissions = extract_three_significant_digits( - energy_tracker.get_total_emissions() / num_generated_tokens - ) - LOGGER.info(f"\t+ Generation pass energy consumption: {self.generate_energy} (kWh/token)") - LOGGER.info(f"\t+ Generation pass carbon emissions: {self.generate_emissions} (kgCO2eq/token)") - LOGGER.info(f"\t+ Full details in the CodeCarbon report: {os.getcwd()}/generate_codecarbon.csv") + LOGGER.info(f"\t+ Generation pass energy consumption: {self.generate_energy} (kWh/token)") + LOGGER.info(f"\t+ Generation pass carbon emissions: {self.generate_emissions} (kgCO2eq/token)") + LOGGER.info(f"\t+ Full details in the CodeCarbon report: {os.getcwd()}/generate_codecarbon.csv") + + def run_generate_memory_tracking(self, backend: "Backend") -> None: + generate_input = self.input_generator.generate(mode="generate") + + LOGGER.info("\t+ Preparing input for the generation pass") + generate_input = backend.prepare_input(generate_input) + + LOGGER.info("\t+ Tracking generation pass peak 
memory") + memory_tracker = MemoryTracker(device=backend.device) + with memory_tracker.track(): + _ = backend.generate(generate_input, self.config.generate_kwargs) + self.generate_max_memory_used = memory_tracker.get_max_memory_used() + self.generate_max_memory_reserved = memory_tracker.get_max_memory_reserved() + self.generate_max_memory_allocated = memory_tracker.get_max_memory_allocated() + + LOGGER.info(f"\t+ Generation pass max memory used: {self.generate_max_memory_used} (MB)") + LOGGER.info(f"\t+ Generation pass max memory reserved: {self.generate_max_memory_reserved} (MB)") + LOGGER.info(f"\t+ Generation pass max memory allocated: {self.generate_max_memory_allocated} (MB)") # Metrics ## Forward pass metrics diff --git a/optimum_benchmark/trackers/latency.py b/optimum_benchmark/trackers/latency.py index 51f66840..d34be65f 100644 --- a/optimum_benchmark/trackers/latency.py +++ b/optimum_benchmark/trackers/latency.py @@ -1,4 +1,3 @@ -import os import time from contextlib import contextmanager from logging import getLogger diff --git a/optimum_benchmark/trackers/memory.py b/optimum_benchmark/trackers/memory.py index 9d97bc47..de6cd65f 100644 --- a/optimum_benchmark/trackers/memory.py +++ b/optimum_benchmark/trackers/memory.py @@ -18,39 +18,52 @@ def __init__(self, device: torch.device): self.device = device self.max_memory_used: int = 0 - self.max_memory_allocated: int = 0 self.max_memory_reserved: int = 0 + self.max_memory_allocated: int = 0 if self.device.type == "cuda": CUDA_VISIBLE_DEVICES = os.environ.get("CUDA_VISIBLE_DEVICES", None) if CUDA_VISIBLE_DEVICES is not None: # if CUDA_VISIBLE_DEVICES is set, only the visible devices' memory is tracked - self.device_ids = list(map(int, CUDA_VISIBLE_DEVICES.split(","))) + self.cuda_device_ids = list(map(int, CUDA_VISIBLE_DEVICES.split(","))) else: # if CUDA_VISIBLE_DEVICES is not set, only the main device's memory is tracked # which is 0 because otherwise, the experiment would've raised an error asking for # CUDA_VISIBLE_DEVICES to be set - self.device_ids = [self.device.index if self.device.index is not None else 0] - LOGGER.info(f"Tracking CUDA devices: {self.device_ids}") + self.cuda_device_ids = [self.device.index if self.device.index is not None else 0] + + self.pytorch_device_ids = list(range(len(self.cuda_device_ids))) + + LOGGER.info(f"Tracking CUDA devices: {self.cuda_device_ids}") + LOGGER.info(f"Tracking Pytorch CUDA devices: {self.pytorch_device_ids}") @contextmanager - def track(self, interval: float = 0.01): + def track(self): if self.device.type == "cuda": yield from self._cuda_memory() else: - yield from self._cpu_memory(interval) + yield from self._cpu_memory() def get_max_memory_used(self): return bytes_to_mega_bytes(self.max_memory_used) - def get_max_memory_allocated(self): - return bytes_to_mega_bytes(self.max_memory_allocated) - def get_max_memory_reserved(self): return bytes_to_mega_bytes(self.max_memory_reserved) + def get_max_memory_allocated(self): + return bytes_to_mega_bytes(self.max_memory_allocated) + def _cuda_memory(self): - for device_index in range(len(self.device_ids)): + # reset cache + torch.cuda.empty_cache() + + for device_index in self.pytorch_device_ids: + # reset accumulated stats + torch.cuda.reset_accumulated_memory_stats(device=device_index) + # reset max stats + torch.cuda.reset_max_memory_allocated(device=device_index) + torch.cuda.reset_max_memory_cached(device=device_index) + # reset peak stats torch.cuda.reset_peak_memory_stats(device=device_index) if is_nvidia_system(): @@ -63,7 +76,7 
@@ def _cuda_memory(self): handles = [] nvml.nvmlInit() - for device_index in self.device_ids: + for device_index in self.cuda_device_ids: handle = nvml.nvmlDeviceGetHandleByIndex(device_index) handles.append(handle) @@ -73,6 +86,7 @@ def _cuda_memory(self): meminfo = nvml.nvmlDeviceGetMemoryInfo(handle) self.max_memory_used += meminfo.used + LOGGER.debug(f"PyNVML max memory used: {self.get_max_memory_used()} MB") nvml.nvmlShutdown() elif is_rocm_system(): if is_pyrsmi_available(): @@ -83,26 +97,25 @@ def _cuda_memory(self): ) rocml.smi_initialize() - print(rocml.smi_get_device_compute_process()) yield - for device_index in self.device_ids: + for device_index in self.cuda_device_ids: meminfo_used = rocml.smi_get_device_memory_used(device_index) self.max_memory_used += meminfo_used + LOGGER.debug(f"PyRSMI max memory used: {self.get_max_memory_used()} MB") rocml.smi_shutdown() else: raise ValueError("Could not measure GPU memory usage for a system different than NVIDIA or AMD RoCm.") - for device_index in range(len(self.device_ids)): + for device_index in self.pytorch_device_ids: self.max_memory_allocated += torch.cuda.max_memory_allocated(device=device_index) self.max_memory_reserved += torch.cuda.max_memory_reserved(device=device_index) - LOGGER.debug(f"Peak memory usage: {self.get_max_memory_used()} MB") LOGGER.debug(f"Pytorch max memory allocated: {self.get_max_memory_allocated()} MB") LOGGER.debug(f"Pytorch max memory reserved: {self.get_max_memory_reserved()} MB") - def _cpu_memory(self, interval: float): + def _cpu_memory(self, interval: float = 0.001): child_connection, parent_connection = Pipe() # instantiate process mem_process: Process = PeakMemoryMeasureProcess(os.getpid(), child_connection, interval)
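For reference, and not part of the patch itself: a minimal sketch of how the refactored `MemoryTracker` above is exercised after this change. The import path is inferred from the file location in this diff (`optimum_benchmark/trackers/memory.py`), and `run_forward_pass` is a hypothetical stand-in workload; on an NVIDIA machine the "used" figure additionally relies on `pynvml` being installed.

```python
import torch

# import path inferred from optimum_benchmark/trackers/memory.py in this diff
from optimum_benchmark.trackers.memory import MemoryTracker


def run_forward_pass():
    # hypothetical stand-in workload: any call that allocates CUDA memory
    x = torch.randn(16, 256, 4096, device="cuda", dtype=torch.float16)
    return x @ x.transpose(-1, -2)


tracker = MemoryTracker(device=torch.device("cuda:0"))

# track() no longer takes a polling interval: CUDA peak stats are reset on entry
# and read back on exit from the PyTorch allocator and PyNVML (or PyRSMI on RoCm)
with tracker.track():
    _ = run_forward_pass()

print(f"max memory used (MB)     : {tracker.get_max_memory_used()}")      # whole-device, via PyNVML/PyRSMI
print(f"max memory reserved (MB) : {tracker.get_max_memory_reserved()}")  # PyTorch allocator
print(f"max memory allocated (MB): {tracker.get_max_memory_allocated()}") # PyTorch allocator
```

The three getters mirror the `forward.max_memory_*` / `generate.max_memory_*` columns in the result CSVs above; per the new comment in `run()`, memory tracking now happens first so the peak is measured on the very first forward/generate call.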