diff --git a/.gitignore b/.gitignore index a589733f..30063812 100644 --- a/.gitignore +++ b/.gitignore @@ -168,6 +168,4 @@ data/ version.txt actions-runner/ -experiments/ -examples/ -results/ \ No newline at end of file +experiments/ \ No newline at end of file diff --git a/examples/running-llamas/README.md b/examples/running-llamas/README.md index dca82a4e..fbc5984d 100644 --- a/examples/running-llamas/README.md +++ b/examples/running-llamas/README.md @@ -7,7 +7,7 @@ A set of benchmarks on Meta's LLaMA2's inference. You will need to install these quantization packages: ```bash -pip install auto-gptq # or install it from source +pip install auto-gptq ``` ## Running @@ -15,8 +15,8 @@ pip install auto-gptq # or install it from source Then run these commands from this directory: ```bash -optimum-benchmark --config-dir configs/ --config-name _base_ --multirun -optimum-benchmark --config-dir configs/ --config-name gptq --multirun +optimum-benchmark --config-dir configs/single_device --config-name fp16 --multirun +optimum-benchmark --config-dir configs/single_device --config-name gptq-4bit --multirun ``` This will create a folder called `experiments` with the results of the benchmarks with an inference `batch_size` ranging from 1 to 128 and an input `sequence_length` (prompt size) of 256. diff --git a/examples/running-llamas/artifacts/A100-80GB/forward_latency_plot.png b/examples/running-llamas/artifacts/A100-80GB/forward_latency_plot.png deleted file mode 100644 index 4e7dd369..00000000 Binary files a/examples/running-llamas/artifacts/A100-80GB/forward_latency_plot.png and /dev/null differ diff --git a/examples/running-llamas/artifacts/A100-80GB/forward_memory_plot.png b/examples/running-llamas/artifacts/A100-80GB/forward_memory_plot.png deleted file mode 100644 index ff81266e..00000000 Binary files a/examples/running-llamas/artifacts/A100-80GB/forward_memory_plot.png and /dev/null differ diff --git a/examples/running-llamas/artifacts/A100-80GB/full_report.csv b/examples/running-llamas/artifacts/A100-80GB/full_report.csv deleted file mode 100644 index f05eb1e4..00000000 --- a/examples/running-llamas/artifacts/A100-80GB/full_report.csv +++ /dev/null @@ -1,11 +0,0 @@
-experiment_name,backend.name,backend.version,backend._target_,backend.seed,backend.inter_op_num_threads,backend.intra_op_num_threads,backend.initial_isolation_check,backend.continous_isolation_check,backend.delete_cache,backend.no_weights,backend.device_map,backend.torch_dtype,backend.disable_grad,backend.eval_mode,backend.amp_autocast,backend.amp_dtype,backend.torch_compile,backend.bettertransformer,backend.quantization_scheme,backend.use_ddp,backend.peft_strategy,benchmark.name,benchmark._target_,benchmark.duration,benchmark.warmup_runs,benchmark.memory,benchmark.energy,benchmark.input_shapes.batch_size,benchmark.input_shapes.sequence_length,benchmark.input_shapes.num_choices,benchmark.input_shapes.feature_size,benchmark.input_shapes.nb_max_frames,benchmark.input_shapes.audio_sequence_length,benchmark.new_tokens,benchmark.can_diffuse,benchmark.can_generate,benchmark.generate_kwargs.max_new_tokens,benchmark.generate_kwargs.min_new_tokens,benchmark.generate_kwargs.do_sample,benchmark.generate_kwargs.use_cache,benchmark.generate_kwargs.pad_token_id,benchmark.generate_kwargs.num_beams,model,device,task,hub_kwargs.revision,hub_kwargs.cache_dir,hub_kwargs.force_download,hub_kwargs.local_files_only,environment.optimum_version,environment.optimum_commit,environment.transformers_version,environment.transformers_commit,environment.accelerate_version,environment.accelerate_commit,environment.diffusers_version,environment.diffusers_commit,environment.python_version,environment.system,environment.cpu,environment.cpu_count,environment.cpu_ram_mb,environment.gpus,forward.latency(s),forward.throughput(samples/s),forward.max_memory_used(MB),forward.max_memory_allocated(MB),forward.max_memory_reserved(MB),generate.latency(s),generate.throughput(tokens/s),generate.max_memory_used(MB),generate.max_memory_allocated(MB),generate.max_memory_reserved(MB) -fp16-batch_size(16)-sequence_length(256)-new_tokens(512),pytorch,2.1.0+cu118,optimum_benchmark.backends.pytorch.backend.PyTorchBackend,42,,,False,False,False,False,,float16,True,True,False,,False,False,,False,,inference,optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark,10,10,True,False,16,256,1,80,3000,16000,512,False,True,512,512,False,True,0,1,meta-llama/Llama-2-7b-hf,cuda:0,text-generation,main,,False,False,1.13.2,,4.34.1,,0.24.1,,,,3.10.12,Linux, AMD EPYC 7742 64-Core Processor,128,540684,"['NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB']",0.402,39.8,19165,16520,17779,17.4,471.0,27988,26442,84511 -fp16-batch_size(8)-sequence_length(256)-new_tokens(512),pytorch,2.1.0+cu118,optimum_benchmark.backends.pytorch.backend.PyTorchBackend,42,,,False,False,False,False,,float16,True,True,False,,False,False,,False,,inference,optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark,10,10,True,False,8,256,1,80,3000,16000,512,False,True,512,512,False,True,0,1,meta-llama/Llama-2-7b-hf,cuda,text-generation,main,,False,False,1.13.2,,4.34.1,,0.24.1,,,,3.10.12,Linux, AMD EPYC 7742 64-Core Processor,128,540684,"['NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB']",0.204,39.2,17087,15037,15701,14.1,290.0,64889,19997,63503 
-gptq-batch_size(16)-sequence_length(256)-new_tokens(512),pytorch,2.1.0+cu118,optimum_benchmark.backends.pytorch.backend.PyTorchBackend,42,,,False,False,False,False,,float16,True,True,False,,False,False,,False,,inference,optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark,10,10,True,False,16,256,1,80,3000,16000,512,False,True,512,512,False,True,0,1,TheBloke/Llama-2-7B-GPTQ,cuda:0,text-generation,main,,False,False,1.13.2,,4.34.1,,0.24.1,,,,3.10.12,Linux, AMD EPYC 7742 64-Core Processor,128,540684,"['NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB']",0.415,38.6,10900,7080,8604,24.6,333.0,65676,17002,83596 -fp16-batch_size(4)-sequence_length(256)-new_tokens(512),pytorch,2.1.0+cu118,optimum_benchmark.backends.pytorch.backend.PyTorchBackend,42,,,False,False,False,False,,float16,True,True,False,,False,False,,False,,inference,optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark,10,10,True,False,4,256,1,80,3000,16000,512,False,True,512,512,False,True,0,1,meta-llama/Llama-2-7b-hf,cuda,text-generation,main,,False,False,1.13.2,,4.34.1,,0.24.1,,,,3.10.12,Linux, AMD EPYC 7742 64-Core Processor,128,540684,"['NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB']",0.107,37.4,16022,14295,14636,13.9,147.0,26346,16774,24960 -gptq-batch_size(8)-sequence_length(256)-new_tokens(512),pytorch,2.1.0+cu118,optimum_benchmark.backends.pytorch.backend.PyTorchBackend,42,,,False,False,False,False,,float16,True,True,False,,False,False,,False,,inference,optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark,10,10,True,False,8,256,1,80,3000,16000,512,False,True,512,512,False,True,0,1,TheBloke/Llama-2-7B-GPTQ,cuda,text-generation,main,,False,False,1.13.2,,4.34.1,,0.24.1,,,,3.10.12,Linux, AMD EPYC 7742 64-Core Processor,128,540684,"['NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB']",0.223,35.9,8826,5597,6530,19.9,206.0,56629,10557,54333 -fp16-batch_size(2)-sequence_length(256)-new_tokens(512),pytorch,2.1.0+cu118,optimum_benchmark.backends.pytorch.backend.PyTorchBackend,42,,,False,False,False,False,,float16,True,True,False,,False,False,,False,,inference,optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark,10,10,True,False,2,256,1,80,3000,16000,512,False,True,512,512,False,True,0,1,meta-llama/Llama-2-7b-hf,cuda,text-generation,main,,False,False,1.13.2,,4.34.1,,0.24.1,,,,3.10.12,Linux, AMD EPYC 7742 64-Core Processor,128,540684,"['NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB']",0.0579,34.5,15392,13924,14006,13.6,75.3,17003,15162,15617 -gptq-batch_size(4)-sequence_length(256)-new_tokens(512),pytorch,2.1.0+cu118,optimum_benchmark.backends.pytorch.backend.PyTorchBackend,42,,,False,False,False,False,,float16,True,True,False,,False,False,,False,,inference,optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark,10,10,True,False,4,256,1,80,3000,16000,512,False,True,512,512,False,True,0,1,TheBloke/Llama-2-7B-GPTQ,cuda,text-generation,main,,False,False,1.13.2,,4.34.1,,0.24.1,,,,3.10.12,Linux, AMD EPYC 7742 64-Core Processor,128,540684,"['NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB']",0.122,32.8,7761,4855,5465,15.3,134.0,18085,7335,15789 
-fp16-batch_size(1)-sequence_length(256)-new_tokens(512),pytorch,2.1.0+cu118,optimum_benchmark.backends.pytorch.backend.PyTorchBackend,42,,,False,False,False,False,,float16,True,True,False,,False,False,,False,,inference,optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark,10,10,True,False,1,256,1,80,3000,16000,512,False,True,512,512,False,True,0,1,meta-llama/Llama-2-7b-hf,cuda,text-generation,main,,False,False,1.13.2,,4.34.1,,0.24.1,,,,3.10.12,Linux, AMD EPYC 7742 64-Core Processor,128,540684,"['NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB']",0.0328,30.5,15153,13738,13767,13.5,37.9,15866,14356,14480 -gptq-batch_size(2)-sequence_length(256)-new_tokens(512),pytorch,2.1.0+cu118,optimum_benchmark.backends.pytorch.backend.PyTorchBackend,42,,,False,False,False,False,,float16,True,True,False,,False,False,,False,,inference,optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark,10,10,True,False,2,256,1,80,3000,16000,512,False,True,512,512,False,True,0,1,TheBloke/Llama-2-7B-GPTQ,cuda,text-generation,main,,False,False,1.13.2,,4.34.1,,0.24.1,,,,3.10.12,Linux, AMD EPYC 7742 64-Core Processor,128,540684,"['NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB']",0.0706,28.3,6872,4484,4575,15.4,66.5,8822,5722,6526 -gptq-batch_size(1)-sequence_length(256)-new_tokens(512),pytorch,2.1.0+cu118,optimum_benchmark.backends.pytorch.backend.PyTorchBackend,42,,,False,False,False,False,,float16,True,True,False,,False,False,,False,,inference,optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark,10,10,True,False,1,256,1,80,3000,16000,512,False,True,512,512,False,True,0,1,TheBloke/Llama-2-7B-GPTQ,cuda,text-generation,main,,False,False,1.13.2,,4.34.1,,0.24.1,,,,3.10.12,Linux, AMD EPYC 7742 64-Core Processor,128,540684,"['NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB', 'NVIDIA A100-SXM4-80GB']",0.0458,21.8,6746,4298,4450,14.8,34.6,7606,4916,5309 diff --git a/examples/running-llamas/artifacts/A100-80GB/generate_memory_plot.png b/examples/running-llamas/artifacts/A100-80GB/generate_memory_plot.png deleted file mode 100644 index 8f5c9e34..00000000 Binary files a/examples/running-llamas/artifacts/A100-80GB/generate_memory_plot.png and /dev/null differ diff --git a/examples/running-llamas/artifacts/A100-80GB/generate_throughput_plot.png b/examples/running-llamas/artifacts/A100-80GB/generate_throughput_plot.png deleted file mode 100644 index dafd743e..00000000 Binary files a/examples/running-llamas/artifacts/A100-80GB/generate_throughput_plot.png and /dev/null differ diff --git a/examples/running-llamas/artifacts/A100-80GB/rich_table.svg b/examples/running-llamas/artifacts/A100-80GB/rich_table.svg deleted file mode 100644 index 09ae22bf..00000000 --- a/examples/running-llamas/artifacts/A100-80GB/rich_table.svg +++ /dev/null @@ -1,163 +0,0 @@ [163 deleted SVG lines: a Rich-rendered "Inference Report" table of the same per-experiment metrics as full_report.csv above (GPU, batch size, forward latency/throughput, max memory used/allocated/reserved, generate throughput/memory, quantization scheme, group); the SVG markup and cell text are garbled beyond recovery here] diff --git a/examples/running-llamas/artifacts/A100-80GB/short_report.csv b/examples/running-llamas/artifacts/A100-80GB/short_report.csv deleted file mode 100644 index 6d6dba2d..00000000 --- a/examples/running-llamas/artifacts/A100-80GB/short_report.csv +++ /dev/null @@ -1,11 +0,0 @@ -experiment_name,GPU,Batch Size,Forward Latency (s),Forward Throughput (samples/s),Forward Max Memory Used (MB),Forward Max Memory Allocated (MB),Forward Max Memory Reserved (MB),Generate Throughput (tokens/s),Generate Max Memory Used (MB),Generate Max Memory Allocated (MB),Generate Max Memory Reserved (MB),Quantization Scheme,Group
-fp16-batch_size(16)-sequence_length(256)-new_tokens(512),A100,16,0.402,39.8,19165,16520,17779,471.0,27988,26442,84511,fp16,A100-fp16 -fp16-batch_size(8)-sequence_length(256)-new_tokens(512),A100,8,0.204,39.2,17087,15037,15701,290.0,64889,19997,63503,fp16,A100-fp16 -gptq-batch_size(16)-sequence_length(256)-new_tokens(512),A100,16,0.415,38.6,10900,7080,8604,333.0,65676,17002,83596,GPTQ,A100-GPTQ -fp16-batch_size(4)-sequence_length(256)-new_tokens(512),A100,4,0.107,37.4,16022,14295,14636,147.0,26346,16774,24960,fp16,A100-fp16 -gptq-batch_size(8)-sequence_length(256)-new_tokens(512),A100,8,0.223,35.9,8826,5597,6530,206.0,56629,10557,54333,GPTQ,A100-GPTQ -fp16-batch_size(2)-sequence_length(256)-new_tokens(512),A100,2,0.0579,34.5,15392,13924,14006,75.3,17003,15162,15617,fp16,A100-fp16 -gptq-batch_size(4)-sequence_length(256)-new_tokens(512),A100,4,0.122,32.8,7761,4855,5465,134.0,18085,7335,15789,GPTQ,A100-GPTQ -fp16-batch_size(1)-sequence_length(256)-new_tokens(512),A100,1,0.0328,30.5,15153,13738,13767,37.9,15866,14356,14480,fp16,A100-fp16 -gptq-batch_size(2)-sequence_length(256)-new_tokens(512),A100,2,0.0706,28.3,6872,4484,4575,66.5,8822,5722,6526,GPTQ,A100-GPTQ -gptq-batch_size(1)-sequence_length(256)-new_tokens(512),A100,1,0.0458,21.8,6746,4298,4450,34.6,7606,4916,5309,GPTQ,A100-GPTQ diff --git a/examples/running-llamas/configs/bnb.yaml b/examples/running-llamas/configs/bnb.yaml deleted file mode 100644 index 61cf1ebd..00000000 --- a/examples/running-llamas/configs/bnb.yaml +++ /dev/null @@ -1,11 +0,0 @@ -defaults: - - _base_ - - _self_ - -experiment_name: bnb-batch_size(${benchmark.input_shapes.batch_size})-sequence_length(${benchmark.input_shapes.sequence_length})-new_tokens(${benchmark.new_tokens}) - -backend: - quantization_scheme: bnb - quantization_config: - load_in_4bit: true - bnb_4bit_compute_dtype: float16 diff --git a/examples/running-llamas/configs/gptq.yaml b/examples/running-llamas/configs/gptq.yaml deleted file mode 100644 index 3f15bdd5..00000000 --- a/examples/running-llamas/configs/gptq.yaml +++ /dev/null @@ -1,6 +0,0 @@ -defaults: - - _base_ - - _self_ - -experiment_name: gptq-batch_size(${benchmark.input_shapes.batch_size})-sequence_length(${benchmark.input_shapes.sequence_length})-new_tokens(${benchmark.new_tokens}) -model: TheBloke/Llama-2-7B-GPTQ diff --git a/examples/running-llamas/configs/_base_.yaml b/examples/running-llamas/configs/single_device/_base_.yaml similarity index 55% rename from examples/running-llamas/configs/_base_.yaml rename to examples/running-llamas/configs/single_device/_base_.yaml index 23a3eaa4..ce06aa17 100644 --- a/examples/running-llamas/configs/_base_.yaml +++ b/examples/running-llamas/configs/single_device/_base_.yaml @@ -1,5 +1,6 @@ defaults: - backend: pytorch # default backend + - launcher: process # isolated launcher - benchmark: inference # default benchmark - experiment # inheriting from experiment config - _self_ # for hydra 1.1 compatibility @@ -8,31 +9,31 @@ defaults: hydra: run: - dir: experiments/${experiment_name} + dir: experiments/${oc.env:HOSTNAME}/single_device/${model}/${experiment_name} sweep: - dir: experiments/${experiment_name} + dir: experiments/${oc.env:HOSTNAME}/single_device/${model}/${experiment_name} job: chdir: true env_set: - CUDA_VISIBLE_DEVICES: 0 + CUDA_VISIBLE_DEVICES: 2 CUDA_DEVICE_ORDER: PCI_BUS_ID sweeper: params: - benchmark.input_shapes.batch_size: 1,2,4,8,16 + benchmark.input_shapes.batch_size: 1,2,4,8,16,32,64,128 -experiment_name: 
fp16-batch_size(${benchmark.input_shapes.batch_size})-sequence_length(${benchmark.input_shapes.sequence_length})-new_tokens(${benchmark.new_tokens}) -model: meta-llama/Llama-2-7b-hf -device: cuda:0 +experiment_name: ??? +device: cuda +model: ??? backend: - initial_isolation_check: false - continous_isolation_check: false + continuous_isolation: true + isolation_check_interval: 0.1 torch_dtype: float16 + no_weights: true benchmark: memory: true warmup_runs: 10 - new_tokens: 512 input_shapes: sequence_length: 256 diff --git a/examples/running-llamas/configs/single_device/fp16.yaml b/examples/running-llamas/configs/single_device/fp16.yaml new file mode 100644 index 00000000..c3c39f6a --- /dev/null +++ b/examples/running-llamas/configs/single_device/fp16.yaml @@ -0,0 +1,11 @@ +defaults: + - _base_ + - _self_ + +hydra: + sweeper: + params: + model: NousResearch/Llama-2-7b-hf,NousResearch/Llama-2-13b-hf,NousResearch/Llama-2-70b-hf + +experiment_name: fp16 +model: llama diff --git a/examples/running-llamas/configs/single_device/gptq-4bit.yaml b/examples/running-llamas/configs/single_device/gptq-4bit.yaml new file mode 100644 index 00000000..1c95390c --- /dev/null +++ b/examples/running-llamas/configs/single_device/gptq-4bit.yaml @@ -0,0 +1,6 @@ +defaults: + - _base_ + - _self_ + +experiment_name: gptq-4bit +model: TheBloke/Llama-2-7B-GPTQ diff --git a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(1)-sequence_length(256)-new_tokens(512)/0/.hydra/config.yaml b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(1)-sequence_length(256)-new_tokens(512)/0/.hydra/config.yaml deleted file mode 100644 index d2cd4143..00000000 --- a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(1)-sequence_length(256)-new_tokens(512)/0/.hydra/config.yaml +++ /dev/null @@ -1,73 +0,0 @@ -backend: - name: pytorch - version: ${pytorch_version:} - _target_: optimum_benchmark.backends.pytorch.backend.PyTorchBackend - seed: 42 - inter_op_num_threads: null - intra_op_num_threads: null - initial_isolation_check: false - continous_isolation_check: false - delete_cache: false - no_weights: false - device_map: null - torch_dtype: float16 - disable_grad: ${is_inference:${benchmark.name}} - eval_mode: ${is_inference:${benchmark.name}} - amp_autocast: false - amp_dtype: null - torch_compile: false - torch_compile_config: {} - bettertransformer: false - quantization_scheme: null - quantization_config: {} - use_ddp: false - ddp_config: {} - peft_strategy: null - peft_config: {} -benchmark: - name: inference - _target_: optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark - duration: 10 - warmup_runs: 10 - memory: true - energy: false - input_shapes: - batch_size: 1 - sequence_length: 256 - num_choices: 1 - feature_size: 80 - nb_max_frames: 3000 - audio_sequence_length: 16000 - new_tokens: 512 - can_diffuse: ${can_diffuse:${task}} - can_generate: ${can_generate:${task}} - forward_kwargs: {} - generate_kwargs: {} -experiment_name: fp16-batch_size(${benchmark.input_shapes.batch_size})-sequence_length(${benchmark.input_shapes.sequence_length})-new_tokens(${benchmark.new_tokens}) -model: meta-llama/Llama-2-7b-hf -device: cuda:0 -task: ${infer_task:${model}} -hub_kwargs: - revision: main - cache_dir: null - force_download: false - local_files_only: false -environment: - optimum_version: 1.13.2 - optimum_commit: null - transformers_version: 4.34.1 - transformers_commit: null - accelerate_version: 0.24.1 - accelerate_commit: null - diffusers_version: null - diffusers_commit: null - 
python_version: 3.10.12 - system: Linux - cpu: ' AMD EPYC 7742 64-Core Processor' - cpu_count: 128 - cpu_ram_mb: 540684 - gpus: - - NVIDIA A100-SXM4-80GB - - NVIDIA A100-SXM4-80GB - - NVIDIA A100-SXM4-80GB - - NVIDIA A100-SXM4-80GB diff --git a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(1)-sequence_length(256)-new_tokens(512)/0/.hydra/hydra.yaml b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(1)-sequence_length(256)-new_tokens(512)/0/.hydra/hydra.yaml deleted file mode 100644 index 41af721d..00000000 --- a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(1)-sequence_length(256)-new_tokens(512)/0/.hydra/hydra.yaml +++ /dev/null @@ -1,174 +0,0 @@ -hydra: - run: - dir: experiments/${experiment_name} - sweep: - dir: experiments/${experiment_name} - subdir: ${hydra.job.num} - launcher: - _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher - sweeper: - _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper - max_batch_size: null - params: - benchmark.input_shapes.batch_size: 1,2,4,8,16 - help: - app_name: ${hydra.job.name} - header: '${hydra.help.app_name} is powered by Hydra. - - ' - footer: 'Powered by Hydra (https://hydra.cc) - - Use --hydra-help to view Hydra specific help - - ' - template: '${hydra.help.header} - - == Configuration groups == - - Compose your configuration from those groups (group=option) - - - $APP_CONFIG_GROUPS - - - == Config == - - Override anything in the config (foo.bar=value) - - - $CONFIG - - - ${hydra.help.footer} - - ' - hydra_help: - template: 'Hydra (${hydra.runtime.version}) - - See https://hydra.cc for more info. - - - == Flags == - - $FLAGS_HELP - - - == Configuration groups == - - Compose your configuration from those groups (For example, append hydra/job_logging=disabled - to command line) - - - $HYDRA_CONFIG_GROUPS - - - Use ''--cfg hydra'' to Show the Hydra config. - - ' - hydra_help: ??? 
- hydra_logging: - version: 1 - formatters: - colorlog: - (): colorlog.ColoredFormatter - format: '[%(cyan)s%(asctime)s%(reset)s][%(purple)sHYDRA%(reset)s] %(message)s' - handlers: - console: - class: logging.StreamHandler - formatter: colorlog - stream: ext://sys.stdout - root: - level: INFO - handlers: - - console - disable_existing_loggers: false - job_logging: - version: 1 - formatters: - simple: - format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' - colorlog: - (): colorlog.ColoredFormatter - format: '[%(cyan)s%(asctime)s%(reset)s][%(blue)s%(name)s%(reset)s][%(log_color)s%(levelname)s%(reset)s] - - %(message)s' - log_colors: - DEBUG: purple - INFO: green - WARNING: yellow - ERROR: red - CRITICAL: red - handlers: - console: - class: logging.StreamHandler - formatter: colorlog - stream: ext://sys.stdout - file: - class: logging.FileHandler - formatter: simple - filename: ${hydra.job.name}.log - root: - level: INFO - handlers: - - console - - file - disable_existing_loggers: false - env: {} - mode: MULTIRUN - searchpath: [] - callbacks: {} - output_subdir: .hydra - overrides: - hydra: - - hydra.mode=MULTIRUN - task: - - benchmark.input_shapes.batch_size=1 - job: - name: experiment - chdir: true - override_dirname: benchmark.input_shapes.batch_size=1 - id: '0' - num: 0 - config_name: _base_ - env_set: - CUDA_VISIBLE_DEVICES: '3' - CUDA_DEVICE_ORDER: PCI_BUS_ID - env_copy: [] - config: - override_dirname: - kv_sep: '=' - item_sep: ',' - exclude_keys: [] - runtime: - version: 1.3.2 - version_base: '1.3' - cwd: /workspace/optimum-benchmark/examples/running-llamas - config_sources: - - path: hydra.conf - schema: pkg - provider: hydra - - path: optimum_benchmark - schema: pkg - provider: main - - path: hydra_plugins.hydra_colorlog.conf - schema: pkg - provider: hydra-colorlog - - path: /workspace/optimum-benchmark/examples/running-llamas/configs - schema: file - provider: command-line - - path: '' - schema: structured - provider: schema - output_dir: /workspace/optimum-benchmark/examples/running-llamas/experiments/fp16-batch_size(1)-sequence_length(256)-new_tokens(512)/0 - choices: - benchmark: inference - backend: pytorch - hydra/env: default - hydra/callbacks: null - hydra/job_logging: colorlog - hydra/hydra_logging: colorlog - hydra/hydra_help: default - hydra/help: default - hydra/sweeper: basic - hydra/launcher: basic - hydra/output: default - verbose: false diff --git a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(1)-sequence_length(256)-new_tokens(512)/0/.hydra/overrides.yaml b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(1)-sequence_length(256)-new_tokens(512)/0/.hydra/overrides.yaml deleted file mode 100644 index 989520ff..00000000 --- a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(1)-sequence_length(256)-new_tokens(512)/0/.hydra/overrides.yaml +++ /dev/null @@ -1 +0,0 @@ -- benchmark.input_shapes.batch_size=1 diff --git a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(1)-sequence_length(256)-new_tokens(512)/0/hydra_config.yaml b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(1)-sequence_length(256)-new_tokens(512)/0/hydra_config.yaml deleted file mode 100644 index 6d8478b5..00000000 --- a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(1)-sequence_length(256)-new_tokens(512)/0/hydra_config.yaml +++ /dev/null @@ -1,79 +0,0 @@ -backend: - name: pytorch - version: 2.1.0+cu118 - _target_: optimum_benchmark.backends.pytorch.backend.PyTorchBackend - seed: 42 - inter_op_num_threads: 
null - intra_op_num_threads: null - initial_isolation_check: false - continous_isolation_check: false - delete_cache: false - no_weights: false - device_map: null - torch_dtype: float16 - disable_grad: true - eval_mode: true - amp_autocast: false - amp_dtype: null - torch_compile: false - torch_compile_config: {} - bettertransformer: false - quantization_scheme: null - quantization_config: {} - use_ddp: false - ddp_config: {} - peft_strategy: null - peft_config: {} -benchmark: - name: inference - _target_: optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark - duration: 10 - warmup_runs: 10 - memory: true - energy: false - input_shapes: - batch_size: 1 - sequence_length: 256 - num_choices: 1 - feature_size: 80 - nb_max_frames: 3000 - audio_sequence_length: 16000 - new_tokens: 512 - can_diffuse: false - can_generate: true - forward_kwargs: {} - generate_kwargs: - max_new_tokens: 512 - min_new_tokens: 512 - do_sample: false - use_cache: true - pad_token_id: 0 - num_beams: 1 -experiment_name: fp16-batch_size(1)-sequence_length(256)-new_tokens(512) -model: meta-llama/Llama-2-7b-hf -device: cuda -task: text-generation -hub_kwargs: - revision: main - cache_dir: null - force_download: false - local_files_only: false -environment: - optimum_version: 1.13.2 - optimum_commit: null - transformers_version: 4.34.1 - transformers_commit: null - accelerate_version: 0.24.1 - accelerate_commit: null - diffusers_version: null - diffusers_commit: null - python_version: 3.10.12 - system: Linux - cpu: ' AMD EPYC 7742 64-Core Processor' - cpu_count: 128 - cpu_ram_mb: 540684 - gpus: - - NVIDIA A100-SXM4-80GB - - NVIDIA A100-SXM4-80GB - - NVIDIA A100-SXM4-80GB - - NVIDIA A100-SXM4-80GB diff --git a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(1)-sequence_length(256)-new_tokens(512)/0/inference_results.csv b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(1)-sequence_length(256)-new_tokens(512)/0/inference_results.csv deleted file mode 100644 index 8161550f..00000000 --- a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(1)-sequence_length(256)-new_tokens(512)/0/inference_results.csv +++ /dev/null @@ -1,2 +0,0 @@ -forward.latency(s),forward.throughput(samples/s),forward.max_memory_used(MB),forward.max_memory_allocated(MB),forward.max_memory_reserved(MB),generate.latency(s),generate.throughput(tokens/s),generate.max_memory_used(MB),generate.max_memory_allocated(MB),generate.max_memory_reserved(MB) -0.0328,30.5,15153,13738,13767,13.5,37.9,15866,14356,14480 diff --git a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(16)-sequence_length(256)-new_tokens(512)/4/.hydra/config.yaml b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(16)-sequence_length(256)-new_tokens(512)/4/.hydra/config.yaml deleted file mode 100644 index 88fed34b..00000000 --- a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(16)-sequence_length(256)-new_tokens(512)/4/.hydra/config.yaml +++ /dev/null @@ -1,73 +0,0 @@ -backend: - name: pytorch - version: ${pytorch_version:} - _target_: optimum_benchmark.backends.pytorch.backend.PyTorchBackend - seed: 42 - inter_op_num_threads: null - intra_op_num_threads: null - initial_isolation_check: false - continous_isolation_check: false - delete_cache: false - no_weights: false - device_map: null - torch_dtype: float16 - disable_grad: ${is_inference:${benchmark.name}} - eval_mode: ${is_inference:${benchmark.name}} - amp_autocast: false - amp_dtype: null - torch_compile: false - torch_compile_config: {} - 
bettertransformer: false - quantization_scheme: null - quantization_config: {} - use_ddp: false - ddp_config: {} - peft_strategy: null - peft_config: {} -benchmark: - name: inference - _target_: optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark - duration: 10 - warmup_runs: 10 - memory: true - energy: false - input_shapes: - batch_size: 16 - sequence_length: 256 - num_choices: 1 - feature_size: 80 - nb_max_frames: 3000 - audio_sequence_length: 16000 - new_tokens: 512 - can_diffuse: ${can_diffuse:${task}} - can_generate: ${can_generate:${task}} - forward_kwargs: {} - generate_kwargs: {} -experiment_name: fp16-batch_size(${benchmark.input_shapes.batch_size})-sequence_length(${benchmark.input_shapes.sequence_length})-new_tokens(${benchmark.new_tokens}) -model: meta-llama/Llama-2-7b-hf -device: cuda:0 -task: ${infer_task:${model}} -hub_kwargs: - revision: main - cache_dir: null - force_download: false - local_files_only: false -environment: - optimum_version: 1.13.2 - optimum_commit: null - transformers_version: 4.34.1 - transformers_commit: null - accelerate_version: 0.24.1 - accelerate_commit: null - diffusers_version: null - diffusers_commit: null - python_version: 3.10.12 - system: Linux - cpu: ' AMD EPYC 7742 64-Core Processor' - cpu_count: 128 - cpu_ram_mb: 540684 - gpus: - - NVIDIA A100-SXM4-80GB - - NVIDIA A100-SXM4-80GB - - NVIDIA A100-SXM4-80GB - - NVIDIA A100-SXM4-80GB diff --git a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(16)-sequence_length(256)-new_tokens(512)/4/.hydra/hydra.yaml b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(16)-sequence_length(256)-new_tokens(512)/4/.hydra/hydra.yaml deleted file mode 100644 index 1b047280..00000000 --- a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(16)-sequence_length(256)-new_tokens(512)/4/.hydra/hydra.yaml +++ /dev/null @@ -1,174 +0,0 @@ -hydra: - run: - dir: experiments/${experiment_name} - sweep: - dir: experiments/${experiment_name} - subdir: ${hydra.job.num} - launcher: - _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher - sweeper: - _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper - max_batch_size: null - params: - benchmark.input_shapes.batch_size: 1,2,4,8,16 - help: - app_name: ${hydra.job.name} - header: '${hydra.help.app_name} is powered by Hydra. - - ' - footer: 'Powered by Hydra (https://hydra.cc) - - Use --hydra-help to view Hydra specific help - - ' - template: '${hydra.help.header} - - == Configuration groups == - - Compose your configuration from those groups (group=option) - - - $APP_CONFIG_GROUPS - - - == Config == - - Override anything in the config (foo.bar=value) - - - $CONFIG - - - ${hydra.help.footer} - - ' - hydra_help: - template: 'Hydra (${hydra.runtime.version}) - - See https://hydra.cc for more info. - - - == Flags == - - $FLAGS_HELP - - - == Configuration groups == - - Compose your configuration from those groups (For example, append hydra/job_logging=disabled - to command line) - - - $HYDRA_CONFIG_GROUPS - - - Use ''--cfg hydra'' to Show the Hydra config. - - ' - hydra_help: ??? 
- hydra_logging: - version: 1 - formatters: - colorlog: - (): colorlog.ColoredFormatter - format: '[%(cyan)s%(asctime)s%(reset)s][%(purple)sHYDRA%(reset)s] %(message)s' - handlers: - console: - class: logging.StreamHandler - formatter: colorlog - stream: ext://sys.stdout - root: - level: INFO - handlers: - - console - disable_existing_loggers: false - job_logging: - version: 1 - formatters: - simple: - format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' - colorlog: - (): colorlog.ColoredFormatter - format: '[%(cyan)s%(asctime)s%(reset)s][%(blue)s%(name)s%(reset)s][%(log_color)s%(levelname)s%(reset)s] - - %(message)s' - log_colors: - DEBUG: purple - INFO: green - WARNING: yellow - ERROR: red - CRITICAL: red - handlers: - console: - class: logging.StreamHandler - formatter: colorlog - stream: ext://sys.stdout - file: - class: logging.FileHandler - formatter: simple - filename: ${hydra.job.name}.log - root: - level: INFO - handlers: - - console - - file - disable_existing_loggers: false - env: {} - mode: MULTIRUN - searchpath: [] - callbacks: {} - output_subdir: .hydra - overrides: - hydra: - - hydra.mode=MULTIRUN - task: - - benchmark.input_shapes.batch_size=16 - job: - name: experiment - chdir: true - override_dirname: benchmark.input_shapes.batch_size=16 - id: '4' - num: 4 - config_name: _base_ - env_set: - CUDA_VISIBLE_DEVICES: '3' - CUDA_DEVICE_ORDER: PCI_BUS_ID - env_copy: [] - config: - override_dirname: - kv_sep: '=' - item_sep: ',' - exclude_keys: [] - runtime: - version: 1.3.2 - version_base: '1.3' - cwd: /workspace/optimum-benchmark/examples/running-llamas - config_sources: - - path: hydra.conf - schema: pkg - provider: hydra - - path: optimum_benchmark - schema: pkg - provider: main - - path: hydra_plugins.hydra_colorlog.conf - schema: pkg - provider: hydra-colorlog - - path: /workspace/optimum-benchmark/examples/running-llamas/configs - schema: file - provider: command-line - - path: '' - schema: structured - provider: schema - output_dir: /workspace/optimum-benchmark/examples/running-llamas/experiments/fp16-batch_size(16)-sequence_length(256)-new_tokens(512)/4 - choices: - benchmark: inference - backend: pytorch - hydra/env: default - hydra/callbacks: null - hydra/job_logging: colorlog - hydra/hydra_logging: colorlog - hydra/hydra_help: default - hydra/help: default - hydra/sweeper: basic - hydra/launcher: basic - hydra/output: default - verbose: false diff --git a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(16)-sequence_length(256)-new_tokens(512)/4/.hydra/overrides.yaml b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(16)-sequence_length(256)-new_tokens(512)/4/.hydra/overrides.yaml deleted file mode 100644 index fdb7f01d..00000000 --- a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(16)-sequence_length(256)-new_tokens(512)/4/.hydra/overrides.yaml +++ /dev/null @@ -1 +0,0 @@ -- benchmark.input_shapes.batch_size=16 diff --git a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(16)-sequence_length(256)-new_tokens(512)/4/hydra_config.yaml b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(16)-sequence_length(256)-new_tokens(512)/4/hydra_config.yaml deleted file mode 100644 index 64882a0b..00000000 --- a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(16)-sequence_length(256)-new_tokens(512)/4/hydra_config.yaml +++ /dev/null @@ -1,79 +0,0 @@ -backend: - name: pytorch - version: 2.1.0+cu118 - _target_: optimum_benchmark.backends.pytorch.backend.PyTorchBackend - seed: 42 - 
inter_op_num_threads: null - intra_op_num_threads: null - initial_isolation_check: false - continous_isolation_check: false - delete_cache: false - no_weights: false - device_map: null - torch_dtype: float16 - disable_grad: true - eval_mode: true - amp_autocast: false - amp_dtype: null - torch_compile: false - torch_compile_config: {} - bettertransformer: false - quantization_scheme: null - quantization_config: {} - use_ddp: false - ddp_config: {} - peft_strategy: null - peft_config: {} -benchmark: - name: inference - _target_: optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark - duration: 10 - warmup_runs: 10 - memory: true - energy: false - input_shapes: - batch_size: 16 - sequence_length: 256 - num_choices: 1 - feature_size: 80 - nb_max_frames: 3000 - audio_sequence_length: 16000 - new_tokens: 512 - can_diffuse: false - can_generate: true - forward_kwargs: {} - generate_kwargs: - max_new_tokens: 512 - min_new_tokens: 512 - do_sample: false - use_cache: true - pad_token_id: 0 - num_beams: 1 -experiment_name: fp16-batch_size(16)-sequence_length(256)-new_tokens(512) -model: meta-llama/Llama-2-7b-hf -device: cuda:0 -task: text-generation -hub_kwargs: - revision: main - cache_dir: null - force_download: false - local_files_only: false -environment: - optimum_version: 1.13.2 - optimum_commit: null - transformers_version: 4.34.1 - transformers_commit: null - accelerate_version: 0.24.1 - accelerate_commit: null - diffusers_version: null - diffusers_commit: null - python_version: 3.10.12 - system: Linux - cpu: ' AMD EPYC 7742 64-Core Processor' - cpu_count: 128 - cpu_ram_mb: 540684 - gpus: - - NVIDIA A100-SXM4-80GB - - NVIDIA A100-SXM4-80GB - - NVIDIA A100-SXM4-80GB - - NVIDIA A100-SXM4-80GB diff --git a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(16)-sequence_length(256)-new_tokens(512)/4/inference_results.csv b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(16)-sequence_length(256)-new_tokens(512)/4/inference_results.csv deleted file mode 100644 index bad3d7ca..00000000 --- a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(16)-sequence_length(256)-new_tokens(512)/4/inference_results.csv +++ /dev/null @@ -1,2 +0,0 @@ -forward.latency(s),forward.throughput(samples/s),forward.max_memory_used(MB),forward.max_memory_allocated(MB),forward.max_memory_reserved(MB),generate.latency(s),generate.throughput(tokens/s),generate.max_memory_used(MB),generate.max_memory_allocated(MB),generate.max_memory_reserved(MB) -0.402,39.8,19165,16520,17779,17.4,471.0,27988,26442,84511 diff --git a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(2)-sequence_length(256)-new_tokens(512)/1/.hydra/config.yaml b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(2)-sequence_length(256)-new_tokens(512)/1/.hydra/config.yaml deleted file mode 100644 index f4197a4f..00000000 --- a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(2)-sequence_length(256)-new_tokens(512)/1/.hydra/config.yaml +++ /dev/null @@ -1,73 +0,0 @@ -backend: - name: pytorch - version: ${pytorch_version:} - _target_: optimum_benchmark.backends.pytorch.backend.PyTorchBackend - seed: 42 - inter_op_num_threads: null - intra_op_num_threads: null - initial_isolation_check: false - continous_isolation_check: false - delete_cache: false - no_weights: false - device_map: null - torch_dtype: float16 - disable_grad: ${is_inference:${benchmark.name}} - eval_mode: ${is_inference:${benchmark.name}} - amp_autocast: false - amp_dtype: null - torch_compile: false - 
torch_compile_config: {} - bettertransformer: false - quantization_scheme: null - quantization_config: {} - use_ddp: false - ddp_config: {} - peft_strategy: null - peft_config: {} -benchmark: - name: inference - _target_: optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark - duration: 10 - warmup_runs: 10 - memory: true - energy: false - input_shapes: - batch_size: 2 - sequence_length: 256 - num_choices: 1 - feature_size: 80 - nb_max_frames: 3000 - audio_sequence_length: 16000 - new_tokens: 512 - can_diffuse: ${can_diffuse:${task}} - can_generate: ${can_generate:${task}} - forward_kwargs: {} - generate_kwargs: {} -experiment_name: fp16-batch_size(${benchmark.input_shapes.batch_size})-sequence_length(${benchmark.input_shapes.sequence_length})-new_tokens(${benchmark.new_tokens}) -model: meta-llama/Llama-2-7b-hf -device: cuda:0 -task: ${infer_task:${model}} -hub_kwargs: - revision: main - cache_dir: null - force_download: false - local_files_only: false -environment: - optimum_version: 1.13.2 - optimum_commit: null - transformers_version: 4.34.1 - transformers_commit: null - accelerate_version: 0.24.1 - accelerate_commit: null - diffusers_version: null - diffusers_commit: null - python_version: 3.10.12 - system: Linux - cpu: ' AMD EPYC 7742 64-Core Processor' - cpu_count: 128 - cpu_ram_mb: 540684 - gpus: - - NVIDIA A100-SXM4-80GB - - NVIDIA A100-SXM4-80GB - - NVIDIA A100-SXM4-80GB - - NVIDIA A100-SXM4-80GB diff --git a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(2)-sequence_length(256)-new_tokens(512)/1/.hydra/hydra.yaml b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(2)-sequence_length(256)-new_tokens(512)/1/.hydra/hydra.yaml deleted file mode 100644 index 7ef5862d..00000000 --- a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(2)-sequence_length(256)-new_tokens(512)/1/.hydra/hydra.yaml +++ /dev/null @@ -1,174 +0,0 @@ -hydra: - run: - dir: experiments/${experiment_name} - sweep: - dir: experiments/${experiment_name} - subdir: ${hydra.job.num} - launcher: - _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher - sweeper: - _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper - max_batch_size: null - params: - benchmark.input_shapes.batch_size: 1,2,4,8,16 - help: - app_name: ${hydra.job.name} - header: '${hydra.help.app_name} is powered by Hydra. - - ' - footer: 'Powered by Hydra (https://hydra.cc) - - Use --hydra-help to view Hydra specific help - - ' - template: '${hydra.help.header} - - == Configuration groups == - - Compose your configuration from those groups (group=option) - - - $APP_CONFIG_GROUPS - - - == Config == - - Override anything in the config (foo.bar=value) - - - $CONFIG - - - ${hydra.help.footer} - - ' - hydra_help: - template: 'Hydra (${hydra.runtime.version}) - - See https://hydra.cc for more info. - - - == Flags == - - $FLAGS_HELP - - - == Configuration groups == - - Compose your configuration from those groups (For example, append hydra/job_logging=disabled - to command line) - - - $HYDRA_CONFIG_GROUPS - - - Use ''--cfg hydra'' to Show the Hydra config. - - ' - hydra_help: ??? 
- hydra_logging: - version: 1 - formatters: - colorlog: - (): colorlog.ColoredFormatter - format: '[%(cyan)s%(asctime)s%(reset)s][%(purple)sHYDRA%(reset)s] %(message)s' - handlers: - console: - class: logging.StreamHandler - formatter: colorlog - stream: ext://sys.stdout - root: - level: INFO - handlers: - - console - disable_existing_loggers: false - job_logging: - version: 1 - formatters: - simple: - format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' - colorlog: - (): colorlog.ColoredFormatter - format: '[%(cyan)s%(asctime)s%(reset)s][%(blue)s%(name)s%(reset)s][%(log_color)s%(levelname)s%(reset)s] - - %(message)s' - log_colors: - DEBUG: purple - INFO: green - WARNING: yellow - ERROR: red - CRITICAL: red - handlers: - console: - class: logging.StreamHandler - formatter: colorlog - stream: ext://sys.stdout - file: - class: logging.FileHandler - formatter: simple - filename: ${hydra.job.name}.log - root: - level: INFO - handlers: - - console - - file - disable_existing_loggers: false - env: {} - mode: MULTIRUN - searchpath: [] - callbacks: {} - output_subdir: .hydra - overrides: - hydra: - - hydra.mode=MULTIRUN - task: - - benchmark.input_shapes.batch_size=2 - job: - name: experiment - chdir: true - override_dirname: benchmark.input_shapes.batch_size=2 - id: '1' - num: 1 - config_name: _base_ - env_set: - CUDA_VISIBLE_DEVICES: '3' - CUDA_DEVICE_ORDER: PCI_BUS_ID - env_copy: [] - config: - override_dirname: - kv_sep: '=' - item_sep: ',' - exclude_keys: [] - runtime: - version: 1.3.2 - version_base: '1.3' - cwd: /workspace/optimum-benchmark/examples/running-llamas - config_sources: - - path: hydra.conf - schema: pkg - provider: hydra - - path: optimum_benchmark - schema: pkg - provider: main - - path: hydra_plugins.hydra_colorlog.conf - schema: pkg - provider: hydra-colorlog - - path: /workspace/optimum-benchmark/examples/running-llamas/configs - schema: file - provider: command-line - - path: '' - schema: structured - provider: schema - output_dir: /workspace/optimum-benchmark/examples/running-llamas/experiments/fp16-batch_size(2)-sequence_length(256)-new_tokens(512)/1 - choices: - benchmark: inference - backend: pytorch - hydra/env: default - hydra/callbacks: null - hydra/job_logging: colorlog - hydra/hydra_logging: colorlog - hydra/hydra_help: default - hydra/help: default - hydra/sweeper: basic - hydra/launcher: basic - hydra/output: default - verbose: false diff --git a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(2)-sequence_length(256)-new_tokens(512)/1/.hydra/overrides.yaml b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(2)-sequence_length(256)-new_tokens(512)/1/.hydra/overrides.yaml deleted file mode 100644 index 8211b85f..00000000 --- a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(2)-sequence_length(256)-new_tokens(512)/1/.hydra/overrides.yaml +++ /dev/null @@ -1 +0,0 @@ -- benchmark.input_shapes.batch_size=2 diff --git a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(2)-sequence_length(256)-new_tokens(512)/1/hydra_config.yaml b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(2)-sequence_length(256)-new_tokens(512)/1/hydra_config.yaml deleted file mode 100644 index 1de85dc8..00000000 --- a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(2)-sequence_length(256)-new_tokens(512)/1/hydra_config.yaml +++ /dev/null @@ -1,79 +0,0 @@ -backend: - name: pytorch - version: 2.1.0+cu118 - _target_: optimum_benchmark.backends.pytorch.backend.PyTorchBackend - seed: 42 - inter_op_num_threads: 
null - intra_op_num_threads: null - initial_isolation_check: false - continous_isolation_check: false - delete_cache: false - no_weights: false - device_map: null - torch_dtype: float16 - disable_grad: true - eval_mode: true - amp_autocast: false - amp_dtype: null - torch_compile: false - torch_compile_config: {} - bettertransformer: false - quantization_scheme: null - quantization_config: {} - use_ddp: false - ddp_config: {} - peft_strategy: null - peft_config: {} -benchmark: - name: inference - _target_: optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark - duration: 10 - warmup_runs: 10 - memory: true - energy: false - input_shapes: - batch_size: 2 - sequence_length: 256 - num_choices: 1 - feature_size: 80 - nb_max_frames: 3000 - audio_sequence_length: 16000 - new_tokens: 512 - can_diffuse: false - can_generate: true - forward_kwargs: {} - generate_kwargs: - max_new_tokens: 512 - min_new_tokens: 512 - do_sample: false - use_cache: true - pad_token_id: 0 - num_beams: 1 -experiment_name: fp16-batch_size(2)-sequence_length(256)-new_tokens(512) -model: meta-llama/Llama-2-7b-hf -device: cuda -task: text-generation -hub_kwargs: - revision: main - cache_dir: null - force_download: false - local_files_only: false -environment: - optimum_version: 1.13.2 - optimum_commit: null - transformers_version: 4.34.1 - transformers_commit: null - accelerate_version: 0.24.1 - accelerate_commit: null - diffusers_version: null - diffusers_commit: null - python_version: 3.10.12 - system: Linux - cpu: ' AMD EPYC 7742 64-Core Processor' - cpu_count: 128 - cpu_ram_mb: 540684 - gpus: - - NVIDIA A100-SXM4-80GB - - NVIDIA A100-SXM4-80GB - - NVIDIA A100-SXM4-80GB - - NVIDIA A100-SXM4-80GB diff --git a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(2)-sequence_length(256)-new_tokens(512)/1/inference_results.csv b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(2)-sequence_length(256)-new_tokens(512)/1/inference_results.csv deleted file mode 100644 index 42a8985a..00000000 --- a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(2)-sequence_length(256)-new_tokens(512)/1/inference_results.csv +++ /dev/null @@ -1,2 +0,0 @@ -forward.latency(s),forward.throughput(samples/s),forward.max_memory_used(MB),forward.max_memory_allocated(MB),forward.max_memory_reserved(MB),generate.latency(s),generate.throughput(tokens/s),generate.max_memory_used(MB),generate.max_memory_allocated(MB),generate.max_memory_reserved(MB) -0.0579,34.5,15392,13924,14006,13.6,75.3,17003,15162,15617 diff --git a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(2)-sequence_length(256)-new_tokens(512)/multirun.yaml b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(2)-sequence_length(256)-new_tokens(512)/multirun.yaml deleted file mode 100644 index 6de5aaf7..00000000 --- a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(2)-sequence_length(256)-new_tokens(512)/multirun.yaml +++ /dev/null @@ -1,246 +0,0 @@ -hydra: - run: - dir: experiments/${experiment_name} - sweep: - dir: experiments/${experiment_name} - subdir: ${hydra.job.num} - launcher: - _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher - sweeper: - _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper - max_batch_size: null - params: - benchmark.input_shapes.batch_size: 1,2,4,8,16 - help: - app_name: ${hydra.job.name} - header: '${hydra.help.app_name} is powered by Hydra. 
- - ' - footer: 'Powered by Hydra (https://hydra.cc) - - Use --hydra-help to view Hydra specific help - - ' - template: '${hydra.help.header} - - == Configuration groups == - - Compose your configuration from those groups (group=option) - - - $APP_CONFIG_GROUPS - - - == Config == - - Override anything in the config (foo.bar=value) - - - $CONFIG - - - ${hydra.help.footer} - - ' - hydra_help: - template: 'Hydra (${hydra.runtime.version}) - - See https://hydra.cc for more info. - - - == Flags == - - $FLAGS_HELP - - - == Configuration groups == - - Compose your configuration from those groups (For example, append hydra/job_logging=disabled - to command line) - - - $HYDRA_CONFIG_GROUPS - - - Use ''--cfg hydra'' to Show the Hydra config. - - ' - hydra_help: ??? - hydra_logging: - version: 1 - formatters: - colorlog: - (): colorlog.ColoredFormatter - format: '[%(cyan)s%(asctime)s%(reset)s][%(purple)sHYDRA%(reset)s] %(message)s' - handlers: - console: - class: logging.StreamHandler - formatter: colorlog - stream: ext://sys.stdout - root: - level: INFO - handlers: - - console - disable_existing_loggers: false - job_logging: - version: 1 - formatters: - simple: - format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s' - colorlog: - (): colorlog.ColoredFormatter - format: '[%(cyan)s%(asctime)s%(reset)s][%(blue)s%(name)s%(reset)s][%(log_color)s%(levelname)s%(reset)s] - - %(message)s' - log_colors: - DEBUG: purple - INFO: green - WARNING: yellow - ERROR: red - CRITICAL: red - handlers: - console: - class: logging.StreamHandler - formatter: colorlog - stream: ext://sys.stdout - file: - class: logging.FileHandler - formatter: simple - filename: ${hydra.job.name}.log - root: - level: INFO - handlers: - - console - - file - disable_existing_loggers: false - env: {} - mode: MULTIRUN - searchpath: [] - callbacks: {} - output_subdir: .hydra - overrides: - hydra: - - hydra.mode=MULTIRUN - task: [] - job: - name: experiment - chdir: true - override_dirname: '' - id: ??? - num: ??? - config_name: _base_ - env_set: - CUDA_VISIBLE_DEVICES: '3' - CUDA_DEVICE_ORDER: PCI_BUS_ID - env_copy: [] - config: - override_dirname: - kv_sep: '=' - item_sep: ',' - exclude_keys: [] - runtime: - version: 1.3.2 - version_base: '1.3' - cwd: /workspace/optimum-benchmark/examples/running-llamas - config_sources: - - path: hydra.conf - schema: pkg - provider: hydra - - path: optimum_benchmark - schema: pkg - provider: main - - path: hydra_plugins.hydra_colorlog.conf - schema: pkg - provider: hydra-colorlog - - path: /workspace/optimum-benchmark/examples/running-llamas/configs - schema: file - provider: command-line - - path: '' - schema: structured - provider: schema - output_dir: ??? 
- choices: - benchmark: inference - backend: pytorch - hydra/env: default - hydra/callbacks: null - hydra/job_logging: colorlog - hydra/hydra_logging: colorlog - hydra/hydra_help: default - hydra/help: default - hydra/sweeper: basic - hydra/launcher: basic - hydra/output: default - verbose: false -backend: - name: pytorch - version: ${pytorch_version:} - _target_: optimum_benchmark.backends.pytorch.backend.PyTorchBackend - seed: 42 - inter_op_num_threads: null - intra_op_num_threads: null - initial_isolation_check: false - continous_isolation_check: false - delete_cache: false - no_weights: false - device_map: null - torch_dtype: float16 - disable_grad: ${is_inference:${benchmark.name}} - eval_mode: ${is_inference:${benchmark.name}} - amp_autocast: false - amp_dtype: null - torch_compile: false - torch_compile_config: {} - bettertransformer: false - quantization_scheme: null - quantization_config: {} - use_ddp: false - ddp_config: {} - peft_strategy: null - peft_config: {} -benchmark: - name: inference - _target_: optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark - duration: 10 - warmup_runs: 10 - memory: true - energy: false - input_shapes: - batch_size: 2 - sequence_length: 256 - num_choices: 1 - feature_size: 80 - nb_max_frames: 3000 - audio_sequence_length: 16000 - new_tokens: 512 - can_diffuse: ${can_diffuse:${task}} - can_generate: ${can_generate:${task}} - forward_kwargs: {} - generate_kwargs: {} -experiment_name: fp16-batch_size(${benchmark.input_shapes.batch_size})-sequence_length(${benchmark.input_shapes.sequence_length})-new_tokens(${benchmark.new_tokens}) -model: meta-llama/Llama-2-7b-hf -device: cuda:0 -task: ${infer_task:${model}} -hub_kwargs: - revision: main - cache_dir: null - force_download: false - local_files_only: false -environment: - optimum_version: 1.13.2 - optimum_commit: null - transformers_version: 4.34.1 - transformers_commit: null - accelerate_version: 0.24.1 - accelerate_commit: null - diffusers_version: null - diffusers_commit: null - python_version: 3.10.12 - system: Linux - cpu: ' AMD EPYC 7742 64-Core Processor' - cpu_count: 128 - cpu_ram_mb: 540684 - gpus: - - NVIDIA A100-SXM4-80GB - - NVIDIA A100-SXM4-80GB - - NVIDIA A100-SXM4-80GB - - NVIDIA A100-SXM4-80GB diff --git a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(4)-sequence_length(256)-new_tokens(512)/2/.hydra/config.yaml b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(4)-sequence_length(256)-new_tokens(512)/2/.hydra/config.yaml deleted file mode 100644 index 589d9f13..00000000 --- a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(4)-sequence_length(256)-new_tokens(512)/2/.hydra/config.yaml +++ /dev/null @@ -1,73 +0,0 @@ -backend: - name: pytorch - version: ${pytorch_version:} - _target_: optimum_benchmark.backends.pytorch.backend.PyTorchBackend - seed: 42 - inter_op_num_threads: null - intra_op_num_threads: null - initial_isolation_check: false - continous_isolation_check: false - delete_cache: false - no_weights: false - device_map: null - torch_dtype: float16 - disable_grad: ${is_inference:${benchmark.name}} - eval_mode: ${is_inference:${benchmark.name}} - amp_autocast: false - amp_dtype: null - torch_compile: false - torch_compile_config: {} - bettertransformer: false - quantization_scheme: null - quantization_config: {} - use_ddp: false - ddp_config: {} - peft_strategy: null - peft_config: {} -benchmark: - name: inference - _target_: optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark - duration: 10 - warmup_runs: 10 
-  memory: true
-  energy: false
-  input_shapes:
-    batch_size: 4
-    sequence_length: 256
-    num_choices: 1
-    feature_size: 80
-    nb_max_frames: 3000
-    audio_sequence_length: 16000
-  new_tokens: 512
-  can_diffuse: ${can_diffuse:${task}}
-  can_generate: ${can_generate:${task}}
-  forward_kwargs: {}
-  generate_kwargs: {}
-experiment_name: fp16-batch_size(${benchmark.input_shapes.batch_size})-sequence_length(${benchmark.input_shapes.sequence_length})-new_tokens(${benchmark.new_tokens})
-model: meta-llama/Llama-2-7b-hf
-device: cuda:0
-task: ${infer_task:${model}}
-hub_kwargs:
-  revision: main
-  cache_dir: null
-  force_download: false
-  local_files_only: false
-environment:
-  optimum_version: 1.13.2
-  optimum_commit: null
-  transformers_version: 4.34.1
-  transformers_commit: null
-  accelerate_version: 0.24.1
-  accelerate_commit: null
-  diffusers_version: null
-  diffusers_commit: null
-  python_version: 3.10.12
-  system: Linux
-  cpu: ' AMD EPYC 7742 64-Core Processor'
-  cpu_count: 128
-  cpu_ram_mb: 540684
-  gpus:
-  - NVIDIA A100-SXM4-80GB
-  - NVIDIA A100-SXM4-80GB
-  - NVIDIA A100-SXM4-80GB
-  - NVIDIA A100-SXM4-80GB
diff --git a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(4)-sequence_length(256)-new_tokens(512)/2/.hydra/hydra.yaml b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(4)-sequence_length(256)-new_tokens(512)/2/.hydra/hydra.yaml
deleted file mode 100644
index 8e3abfc7..00000000
--- a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(4)-sequence_length(256)-new_tokens(512)/2/.hydra/hydra.yaml
+++ /dev/null
@@ -1,174 +0,0 @@
-hydra:
-  run:
-    dir: experiments/${experiment_name}
-  sweep:
-    dir: experiments/${experiment_name}
-    subdir: ${hydra.job.num}
-  launcher:
-    _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
-  sweeper:
-    _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
-    max_batch_size: null
-    params:
-      benchmark.input_shapes.batch_size: 1,2,4,8,16
-  help:
-    app_name: ${hydra.job.name}
-    header: '${hydra.help.app_name} is powered by Hydra.
-
-      '
-    footer: 'Powered by Hydra (https://hydra.cc)
-
-      Use --hydra-help to view Hydra specific help
-
-      '
-    template: '${hydra.help.header}
-
-      == Configuration groups ==
-
-      Compose your configuration from those groups (group=option)
-
-
-      $APP_CONFIG_GROUPS
-
-
-      == Config ==
-
-      Override anything in the config (foo.bar=value)
-
-
-      $CONFIG
-
-
-      ${hydra.help.footer}
-
-      '
-  hydra_help:
-    template: 'Hydra (${hydra.runtime.version})
-
-      See https://hydra.cc for more info.
-
-
-      == Flags ==
-
-      $FLAGS_HELP
-
-
-      == Configuration groups ==
-
-      Compose your configuration from those groups (For example, append hydra/job_logging=disabled
-      to command line)
-
-
-      $HYDRA_CONFIG_GROUPS
-
-
-      Use ''--cfg hydra'' to Show the Hydra config.
-
-      '
-    hydra_help: ???
-  hydra_logging:
-    version: 1
-    formatters:
-      colorlog:
-        (): colorlog.ColoredFormatter
-        format: '[%(cyan)s%(asctime)s%(reset)s][%(purple)sHYDRA%(reset)s] %(message)s'
-    handlers:
-      console:
-        class: logging.StreamHandler
-        formatter: colorlog
-        stream: ext://sys.stdout
-    root:
-      level: INFO
-      handlers:
-      - console
-    disable_existing_loggers: false
-  job_logging:
-    version: 1
-    formatters:
-      simple:
-        format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
-      colorlog:
-        (): colorlog.ColoredFormatter
-        format: '[%(cyan)s%(asctime)s%(reset)s][%(blue)s%(name)s%(reset)s][%(log_color)s%(levelname)s%(reset)s]
-          - %(message)s'
-        log_colors:
-          DEBUG: purple
-          INFO: green
-          WARNING: yellow
-          ERROR: red
-          CRITICAL: red
-    handlers:
-      console:
-        class: logging.StreamHandler
-        formatter: colorlog
-        stream: ext://sys.stdout
-      file:
-        class: logging.FileHandler
-        formatter: simple
-        filename: ${hydra.job.name}.log
-    root:
-      level: INFO
-      handlers:
-      - console
-      - file
-    disable_existing_loggers: false
-  env: {}
-  mode: MULTIRUN
-  searchpath: []
-  callbacks: {}
-  output_subdir: .hydra
-  overrides:
-    hydra:
-    - hydra.mode=MULTIRUN
-    task:
-    - benchmark.input_shapes.batch_size=4
-  job:
-    name: experiment
-    chdir: true
-    override_dirname: benchmark.input_shapes.batch_size=4
-    id: '2'
-    num: 2
-    config_name: _base_
-    env_set:
-      CUDA_VISIBLE_DEVICES: '3'
-      CUDA_DEVICE_ORDER: PCI_BUS_ID
-    env_copy: []
-    config:
-      override_dirname:
-        kv_sep: '='
-        item_sep: ','
-        exclude_keys: []
-  runtime:
-    version: 1.3.2
-    version_base: '1.3'
-    cwd: /workspace/optimum-benchmark/examples/running-llamas
-    config_sources:
-    - path: hydra.conf
-      schema: pkg
-      provider: hydra
-    - path: optimum_benchmark
-      schema: pkg
-      provider: main
-    - path: hydra_plugins.hydra_colorlog.conf
-      schema: pkg
-      provider: hydra-colorlog
-    - path: /workspace/optimum-benchmark/examples/running-llamas/configs
-      schema: file
-      provider: command-line
-    - path: ''
-      schema: structured
-      provider: schema
-    output_dir: /workspace/optimum-benchmark/examples/running-llamas/experiments/fp16-batch_size(4)-sequence_length(256)-new_tokens(512)/2
-    choices:
-      benchmark: inference
-      backend: pytorch
-      hydra/env: default
-      hydra/callbacks: null
-      hydra/job_logging: colorlog
-      hydra/hydra_logging: colorlog
-      hydra/hydra_help: default
-      hydra/help: default
-      hydra/sweeper: basic
-      hydra/launcher: basic
-      hydra/output: default
-  verbose: false
diff --git a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(4)-sequence_length(256)-new_tokens(512)/2/.hydra/overrides.yaml b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(4)-sequence_length(256)-new_tokens(512)/2/.hydra/overrides.yaml
deleted file mode 100644
index eef8c9ca..00000000
--- a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(4)-sequence_length(256)-new_tokens(512)/2/.hydra/overrides.yaml
+++ /dev/null
@@ -1 +0,0 @@
-- benchmark.input_shapes.batch_size=4
diff --git a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(4)-sequence_length(256)-new_tokens(512)/2/hydra_config.yaml b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(4)-sequence_length(256)-new_tokens(512)/2/hydra_config.yaml
deleted file mode 100644
index eb07905d..00000000
--- a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(4)-sequence_length(256)-new_tokens(512)/2/hydra_config.yaml
+++ /dev/null
@@ -1,79 +0,0 @@
-backend:
-  name: pytorch
-  version: 2.1.0+cu118
-  _target_: optimum_benchmark.backends.pytorch.backend.PyTorchBackend
-  seed: 42
-  inter_op_num_threads: null
-  intra_op_num_threads: null
-  initial_isolation_check: false
-  continous_isolation_check: false
-  delete_cache: false
-  no_weights: false
-  device_map: null
-  torch_dtype: float16
-  disable_grad: true
-  eval_mode: true
-  amp_autocast: false
-  amp_dtype: null
-  torch_compile: false
-  torch_compile_config: {}
-  bettertransformer: false
-  quantization_scheme: null
-  quantization_config: {}
-  use_ddp: false
-  ddp_config: {}
-  peft_strategy: null
-  peft_config: {}
-benchmark:
-  name: inference
-  _target_: optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark
-  duration: 10
-  warmup_runs: 10
-  memory: true
-  energy: false
-  input_shapes:
-    batch_size: 4
-    sequence_length: 256
-    num_choices: 1
-    feature_size: 80
-    nb_max_frames: 3000
-    audio_sequence_length: 16000
-  new_tokens: 512
-  can_diffuse: false
-  can_generate: true
-  forward_kwargs: {}
-  generate_kwargs:
-    max_new_tokens: 512
-    min_new_tokens: 512
-    do_sample: false
-    use_cache: true
-    pad_token_id: 0
-    num_beams: 1
-experiment_name: fp16-batch_size(4)-sequence_length(256)-new_tokens(512)
-model: meta-llama/Llama-2-7b-hf
-device: cuda
-task: text-generation
-hub_kwargs:
-  revision: main
-  cache_dir: null
-  force_download: false
-  local_files_only: false
-environment:
-  optimum_version: 1.13.2
-  optimum_commit: null
-  transformers_version: 4.34.1
-  transformers_commit: null
-  accelerate_version: 0.24.1
-  accelerate_commit: null
-  diffusers_version: null
-  diffusers_commit: null
-  python_version: 3.10.12
-  system: Linux
-  cpu: ' AMD EPYC 7742 64-Core Processor'
-  cpu_count: 128
-  cpu_ram_mb: 540684
-  gpus:
-  - NVIDIA A100-SXM4-80GB
-  - NVIDIA A100-SXM4-80GB
-  - NVIDIA A100-SXM4-80GB
-  - NVIDIA A100-SXM4-80GB
diff --git a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(4)-sequence_length(256)-new_tokens(512)/2/inference_results.csv b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(4)-sequence_length(256)-new_tokens(512)/2/inference_results.csv
deleted file mode 100644
index 3a40d093..00000000
--- a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(4)-sequence_length(256)-new_tokens(512)/2/inference_results.csv
+++ /dev/null
@@ -1,2 +0,0 @@
-forward.latency(s),forward.throughput(samples/s),forward.max_memory_used(MB),forward.max_memory_allocated(MB),forward.max_memory_reserved(MB),generate.latency(s),generate.throughput(tokens/s),generate.max_memory_used(MB),generate.max_memory_allocated(MB),generate.max_memory_reserved(MB)
-0.107,37.4,16022,14295,14636,13.9,147.0,26346,16774,24960
diff --git a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(8)-sequence_length(256)-new_tokens(512)/3/.hydra/config.yaml b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(8)-sequence_length(256)-new_tokens(512)/3/.hydra/config.yaml
deleted file mode 100644
index ff7402ca..00000000
--- a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(8)-sequence_length(256)-new_tokens(512)/3/.hydra/config.yaml
+++ /dev/null
@@ -1,73 +0,0 @@
-backend:
-  name: pytorch
-  version: ${pytorch_version:}
-  _target_: optimum_benchmark.backends.pytorch.backend.PyTorchBackend
-  seed: 42
-  inter_op_num_threads: null
-  intra_op_num_threads: null
-  initial_isolation_check: false
-  continous_isolation_check: false
-  delete_cache: false
-  no_weights: false
-  device_map: null
-  torch_dtype: float16
-  disable_grad: ${is_inference:${benchmark.name}}
-  eval_mode: ${is_inference:${benchmark.name}}
-  amp_autocast: false
-  amp_dtype: null
-  torch_compile: false
-  torch_compile_config: {}
-  bettertransformer: false
-  quantization_scheme: null
-  quantization_config: {}
-  use_ddp: false
-  ddp_config: {}
-  peft_strategy: null
-  peft_config: {}
-benchmark:
-  name: inference
-  _target_: optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark
-  duration: 10
-  warmup_runs: 10
-  memory: true
-  energy: false
-  input_shapes:
-    batch_size: 8
-    sequence_length: 256
-    num_choices: 1
-    feature_size: 80
-    nb_max_frames: 3000
-    audio_sequence_length: 16000
-  new_tokens: 512
-  can_diffuse: ${can_diffuse:${task}}
-  can_generate: ${can_generate:${task}}
-  forward_kwargs: {}
-  generate_kwargs: {}
-experiment_name: fp16-batch_size(${benchmark.input_shapes.batch_size})-sequence_length(${benchmark.input_shapes.sequence_length})-new_tokens(${benchmark.new_tokens})
-model: meta-llama/Llama-2-7b-hf
-device: cuda:0
-task: ${infer_task:${model}}
-hub_kwargs:
-  revision: main
-  cache_dir: null
-  force_download: false
-  local_files_only: false
-environment:
-  optimum_version: 1.13.2
-  optimum_commit: null
-  transformers_version: 4.34.1
-  transformers_commit: null
-  accelerate_version: 0.24.1
-  accelerate_commit: null
-  diffusers_version: null
-  diffusers_commit: null
-  python_version: 3.10.12
-  system: Linux
-  cpu: ' AMD EPYC 7742 64-Core Processor'
-  cpu_count: 128
-  cpu_ram_mb: 540684
-  gpus:
-  - NVIDIA A100-SXM4-80GB
-  - NVIDIA A100-SXM4-80GB
-  - NVIDIA A100-SXM4-80GB
-  - NVIDIA A100-SXM4-80GB
diff --git a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(8)-sequence_length(256)-new_tokens(512)/3/.hydra/hydra.yaml b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(8)-sequence_length(256)-new_tokens(512)/3/.hydra/hydra.yaml
deleted file mode 100644
index ed654ac1..00000000
--- a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(8)-sequence_length(256)-new_tokens(512)/3/.hydra/hydra.yaml
+++ /dev/null
@@ -1,174 +0,0 @@
-hydra:
-  run:
-    dir: experiments/${experiment_name}
-  sweep:
-    dir: experiments/${experiment_name}
-    subdir: ${hydra.job.num}
-  launcher:
-    _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
-  sweeper:
-    _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
-    max_batch_size: null
-    params:
-      benchmark.input_shapes.batch_size: 1,2,4,8,16
-  help:
-    app_name: ${hydra.job.name}
-    header: '${hydra.help.app_name} is powered by Hydra.
-
-      '
-    footer: 'Powered by Hydra (https://hydra.cc)
-
-      Use --hydra-help to view Hydra specific help
-
-      '
-    template: '${hydra.help.header}
-
-      == Configuration groups ==
-
-      Compose your configuration from those groups (group=option)
-
-
-      $APP_CONFIG_GROUPS
-
-
-      == Config ==
-
-      Override anything in the config (foo.bar=value)
-
-
-      $CONFIG
-
-
-      ${hydra.help.footer}
-
-      '
-  hydra_help:
-    template: 'Hydra (${hydra.runtime.version})
-
-      See https://hydra.cc for more info.
-
-
-      == Flags ==
-
-      $FLAGS_HELP
-
-
-      == Configuration groups ==
-
-      Compose your configuration from those groups (For example, append hydra/job_logging=disabled
-      to command line)
-
-
-      $HYDRA_CONFIG_GROUPS
-
-
-      Use ''--cfg hydra'' to Show the Hydra config.
-
-      '
-    hydra_help: ???
-  hydra_logging:
-    version: 1
-    formatters:
-      colorlog:
-        (): colorlog.ColoredFormatter
-        format: '[%(cyan)s%(asctime)s%(reset)s][%(purple)sHYDRA%(reset)s] %(message)s'
-    handlers:
-      console:
-        class: logging.StreamHandler
-        formatter: colorlog
-        stream: ext://sys.stdout
-    root:
-      level: INFO
-      handlers:
-      - console
-    disable_existing_loggers: false
-  job_logging:
-    version: 1
-    formatters:
-      simple:
-        format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
-      colorlog:
-        (): colorlog.ColoredFormatter
-        format: '[%(cyan)s%(asctime)s%(reset)s][%(blue)s%(name)s%(reset)s][%(log_color)s%(levelname)s%(reset)s]
-          - %(message)s'
-        log_colors:
-          DEBUG: purple
-          INFO: green
-          WARNING: yellow
-          ERROR: red
-          CRITICAL: red
-    handlers:
-      console:
-        class: logging.StreamHandler
-        formatter: colorlog
-        stream: ext://sys.stdout
-      file:
-        class: logging.FileHandler
-        formatter: simple
-        filename: ${hydra.job.name}.log
-    root:
-      level: INFO
-      handlers:
-      - console
-      - file
-    disable_existing_loggers: false
-  env: {}
-  mode: MULTIRUN
-  searchpath: []
-  callbacks: {}
-  output_subdir: .hydra
-  overrides:
-    hydra:
-    - hydra.mode=MULTIRUN
-    task:
-    - benchmark.input_shapes.batch_size=8
-  job:
-    name: experiment
-    chdir: true
-    override_dirname: benchmark.input_shapes.batch_size=8
-    id: '3'
-    num: 3
-    config_name: _base_
-    env_set:
-      CUDA_VISIBLE_DEVICES: '3'
-      CUDA_DEVICE_ORDER: PCI_BUS_ID
-    env_copy: []
-    config:
-      override_dirname:
-        kv_sep: '='
-        item_sep: ','
-        exclude_keys: []
-  runtime:
-    version: 1.3.2
-    version_base: '1.3'
-    cwd: /workspace/optimum-benchmark/examples/running-llamas
-    config_sources:
-    - path: hydra.conf
-      schema: pkg
-      provider: hydra
-    - path: optimum_benchmark
-      schema: pkg
-      provider: main
-    - path: hydra_plugins.hydra_colorlog.conf
-      schema: pkg
-      provider: hydra-colorlog
-    - path: /workspace/optimum-benchmark/examples/running-llamas/configs
-      schema: file
-      provider: command-line
-    - path: ''
-      schema: structured
-      provider: schema
-    output_dir: /workspace/optimum-benchmark/examples/running-llamas/experiments/fp16-batch_size(8)-sequence_length(256)-new_tokens(512)/3
-    choices:
-      benchmark: inference
-      backend: pytorch
-      hydra/env: default
-      hydra/callbacks: null
-      hydra/job_logging: colorlog
-      hydra/hydra_logging: colorlog
-      hydra/hydra_help: default
-      hydra/help: default
-      hydra/sweeper: basic
-      hydra/launcher: basic
-      hydra/output: default
-  verbose: false
diff --git a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(8)-sequence_length(256)-new_tokens(512)/3/.hydra/overrides.yaml b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(8)-sequence_length(256)-new_tokens(512)/3/.hydra/overrides.yaml
deleted file mode 100644
index 8cd14374..00000000
--- a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(8)-sequence_length(256)-new_tokens(512)/3/.hydra/overrides.yaml
+++ /dev/null
@@ -1 +0,0 @@
-- benchmark.input_shapes.batch_size=8
diff --git a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(8)-sequence_length(256)-new_tokens(512)/3/hydra_config.yaml b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(8)-sequence_length(256)-new_tokens(512)/3/hydra_config.yaml
deleted file mode 100644
index a180796d..00000000
--- a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(8)-sequence_length(256)-new_tokens(512)/3/hydra_config.yaml
+++ /dev/null
@@ -1,79 +0,0 @@
-backend:
-  name: pytorch
-  version: 2.1.0+cu118
-  _target_: optimum_benchmark.backends.pytorch.backend.PyTorchBackend
-  seed: 42
-  inter_op_num_threads: null
-  intra_op_num_threads: null
-  initial_isolation_check: false
-  continous_isolation_check: false
-  delete_cache: false
-  no_weights: false
-  device_map: null
-  torch_dtype: float16
-  disable_grad: true
-  eval_mode: true
-  amp_autocast: false
-  amp_dtype: null
-  torch_compile: false
-  torch_compile_config: {}
-  bettertransformer: false
-  quantization_scheme: null
-  quantization_config: {}
-  use_ddp: false
-  ddp_config: {}
-  peft_strategy: null
-  peft_config: {}
-benchmark:
-  name: inference
-  _target_: optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark
-  duration: 10
-  warmup_runs: 10
-  memory: true
-  energy: false
-  input_shapes:
-    batch_size: 8
-    sequence_length: 256
-    num_choices: 1
-    feature_size: 80
-    nb_max_frames: 3000
-    audio_sequence_length: 16000
-  new_tokens: 512
-  can_diffuse: false
-  can_generate: true
-  forward_kwargs: {}
-  generate_kwargs:
-    max_new_tokens: 512
-    min_new_tokens: 512
-    do_sample: false
-    use_cache: true
-    pad_token_id: 0
-    num_beams: 1
-experiment_name: fp16-batch_size(8)-sequence_length(256)-new_tokens(512)
-model: meta-llama/Llama-2-7b-hf
-device: cuda
-task: text-generation
-hub_kwargs:
-  revision: main
-  cache_dir: null
-  force_download: false
-  local_files_only: false
-environment:
-  optimum_version: 1.13.2
-  optimum_commit: null
-  transformers_version: 4.34.1
-  transformers_commit: null
-  accelerate_version: 0.24.1
-  accelerate_commit: null
-  diffusers_version: null
-  diffusers_commit: null
-  python_version: 3.10.12
-  system: Linux
-  cpu: ' AMD EPYC 7742 64-Core Processor'
-  cpu_count: 128
-  cpu_ram_mb: 540684
-  gpus:
-  - NVIDIA A100-SXM4-80GB
-  - NVIDIA A100-SXM4-80GB
-  - NVIDIA A100-SXM4-80GB
-  - NVIDIA A100-SXM4-80GB
diff --git a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(8)-sequence_length(256)-new_tokens(512)/3/inference_results.csv b/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(8)-sequence_length(256)-new_tokens(512)/3/inference_results.csv
deleted file mode 100644
index 7fdc7a5f..00000000
--- a/examples/running-llamas/experiments/A100-80GB/fp16-batch_size(8)-sequence_length(256)-new_tokens(512)/3/inference_results.csv
+++ /dev/null
@@ -1,2 +0,0 @@
-forward.latency(s),forward.throughput(samples/s),forward.max_memory_used(MB),forward.max_memory_allocated(MB),forward.max_memory_reserved(MB),generate.latency(s),generate.throughput(tokens/s),generate.max_memory_used(MB),generate.max_memory_allocated(MB),generate.max_memory_reserved(MB)
-0.204,39.2,17087,15037,15701,14.1,290.0,64889,19997,63503
diff --git a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(1)-sequence_length(256)-new_tokens(512)/0/.hydra/config.yaml b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(1)-sequence_length(256)-new_tokens(512)/0/.hydra/config.yaml
deleted file mode 100644
index 03090426..00000000
--- a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(1)-sequence_length(256)-new_tokens(512)/0/.hydra/config.yaml
+++ /dev/null
@@ -1,73 +0,0 @@
-backend:
-  name: pytorch
-  version: ${pytorch_version:}
-  _target_: optimum_benchmark.backends.pytorch.backend.PyTorchBackend
-  seed: 42
-  inter_op_num_threads: null
-  intra_op_num_threads: null
-  initial_isolation_check: false
-  continous_isolation_check: false
-  delete_cache: false
-  no_weights: false
-  device_map: null
-  torch_dtype: float16
-  disable_grad: ${is_inference:${benchmark.name}}
-  eval_mode: ${is_inference:${benchmark.name}}
-  amp_autocast: false
-  amp_dtype: null
-  torch_compile: false
-  torch_compile_config: {}
-  bettertransformer: false
-  quantization_scheme: null
-  quantization_config: {}
-  use_ddp: false
-  ddp_config: {}
-  peft_strategy: null
-  peft_config: {}
-benchmark:
-  name: inference
-  _target_: optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark
-  duration: 10
-  warmup_runs: 10
-  memory: true
-  energy: false
-  input_shapes:
-    batch_size: 1
-    sequence_length: 256
-    num_choices: 1
-    feature_size: 80
-    nb_max_frames: 3000
-    audio_sequence_length: 16000
-  new_tokens: 512
-  can_diffuse: ${can_diffuse:${task}}
-  can_generate: ${can_generate:${task}}
-  forward_kwargs: {}
-  generate_kwargs: {}
-experiment_name: gptq-batch_size(${benchmark.input_shapes.batch_size})-sequence_length(${benchmark.input_shapes.sequence_length})-new_tokens(${benchmark.new_tokens})
-model: TheBloke/Llama-2-7B-GPTQ
-device: cuda:0
-task: ${infer_task:${model}}
-hub_kwargs:
-  revision: main
-  cache_dir: null
-  force_download: false
-  local_files_only: false
-environment:
-  optimum_version: 1.13.2
-  optimum_commit: null
-  transformers_version: 4.34.1
-  transformers_commit: null
-  accelerate_version: 0.24.1
-  accelerate_commit: null
-  diffusers_version: null
-  diffusers_commit: null
-  python_version: 3.10.12
-  system: Linux
-  cpu: ' AMD EPYC 7742 64-Core Processor'
-  cpu_count: 128
-  cpu_ram_mb: 540684
-  gpus:
-  - NVIDIA A100-SXM4-80GB
-  - NVIDIA A100-SXM4-80GB
-  - NVIDIA A100-SXM4-80GB
-  - NVIDIA A100-SXM4-80GB
diff --git a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(1)-sequence_length(256)-new_tokens(512)/0/.hydra/hydra.yaml b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(1)-sequence_length(256)-new_tokens(512)/0/.hydra/hydra.yaml
deleted file mode 100644
index 65926c1f..00000000
--- a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(1)-sequence_length(256)-new_tokens(512)/0/.hydra/hydra.yaml
+++ /dev/null
@@ -1,174 +0,0 @@
-hydra:
-  run:
-    dir: experiments/${experiment_name}
-  sweep:
-    dir: experiments/${experiment_name}
-    subdir: ${hydra.job.num}
-  launcher:
-    _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
-  sweeper:
-    _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
-    max_batch_size: null
-    params:
-      benchmark.input_shapes.batch_size: 1,2,4,8,16
-  help:
-    app_name: ${hydra.job.name}
-    header: '${hydra.help.app_name} is powered by Hydra.
-
-      '
-    footer: 'Powered by Hydra (https://hydra.cc)
-
-      Use --hydra-help to view Hydra specific help
-
-      '
-    template: '${hydra.help.header}
-
-      == Configuration groups ==
-
-      Compose your configuration from those groups (group=option)
-
-
-      $APP_CONFIG_GROUPS
-
-
-      == Config ==
-
-      Override anything in the config (foo.bar=value)
-
-
-      $CONFIG
-
-
-      ${hydra.help.footer}
-
-      '
-  hydra_help:
-    template: 'Hydra (${hydra.runtime.version})
-
-      See https://hydra.cc for more info.
-
-
-      == Flags ==
-
-      $FLAGS_HELP
-
-
-      == Configuration groups ==
-
-      Compose your configuration from those groups (For example, append hydra/job_logging=disabled
-      to command line)
-
-
-      $HYDRA_CONFIG_GROUPS
-
-
-      Use ''--cfg hydra'' to Show the Hydra config.
-
-      '
-    hydra_help: ???
-  hydra_logging:
-    version: 1
-    formatters:
-      colorlog:
-        (): colorlog.ColoredFormatter
-        format: '[%(cyan)s%(asctime)s%(reset)s][%(purple)sHYDRA%(reset)s] %(message)s'
-    handlers:
-      console:
-        class: logging.StreamHandler
-        formatter: colorlog
-        stream: ext://sys.stdout
-    root:
-      level: INFO
-      handlers:
-      - console
-    disable_existing_loggers: false
-  job_logging:
-    version: 1
-    formatters:
-      simple:
-        format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
-      colorlog:
-        (): colorlog.ColoredFormatter
-        format: '[%(cyan)s%(asctime)s%(reset)s][%(blue)s%(name)s%(reset)s][%(log_color)s%(levelname)s%(reset)s]
-          - %(message)s'
-        log_colors:
-          DEBUG: purple
-          INFO: green
-          WARNING: yellow
-          ERROR: red
-          CRITICAL: red
-    handlers:
-      console:
-        class: logging.StreamHandler
-        formatter: colorlog
-        stream: ext://sys.stdout
-      file:
-        class: logging.FileHandler
-        formatter: simple
-        filename: ${hydra.job.name}.log
-    root:
-      level: INFO
-      handlers:
-      - console
-      - file
-    disable_existing_loggers: false
-  env: {}
-  mode: MULTIRUN
-  searchpath: []
-  callbacks: {}
-  output_subdir: .hydra
-  overrides:
-    hydra:
-    - hydra.mode=MULTIRUN
-    task:
-    - benchmark.input_shapes.batch_size=1
-  job:
-    name: experiment
-    chdir: true
-    override_dirname: benchmark.input_shapes.batch_size=1
-    id: '0'
-    num: 0
-    config_name: gptq
-    env_set:
-      CUDA_VISIBLE_DEVICES: '3'
-      CUDA_DEVICE_ORDER: PCI_BUS_ID
-    env_copy: []
-    config:
-      override_dirname:
-        kv_sep: '='
-        item_sep: ','
-        exclude_keys: []
-  runtime:
-    version: 1.3.2
-    version_base: '1.3'
-    cwd: /workspace/optimum-benchmark/examples/running-llamas
-    config_sources:
-    - path: hydra.conf
-      schema: pkg
-      provider: hydra
-    - path: optimum_benchmark
-      schema: pkg
-      provider: main
-    - path: hydra_plugins.hydra_colorlog.conf
-      schema: pkg
-      provider: hydra-colorlog
-    - path: /workspace/optimum-benchmark/examples/running-llamas/configs
-      schema: file
-      provider: command-line
-    - path: ''
-      schema: structured
-      provider: schema
-    output_dir: /workspace/optimum-benchmark/examples/running-llamas/experiments/gptq-batch_size(1)-sequence_length(256)-new_tokens(512)/0
-    choices:
-      benchmark: inference
-      backend: pytorch
-      hydra/env: default
-      hydra/callbacks: null
-      hydra/job_logging: colorlog
-      hydra/hydra_logging: colorlog
-      hydra/hydra_help: default
-      hydra/help: default
-      hydra/sweeper: basic
-      hydra/launcher: basic
-      hydra/output: default
-  verbose: false
diff --git a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(1)-sequence_length(256)-new_tokens(512)/0/.hydra/overrides.yaml b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(1)-sequence_length(256)-new_tokens(512)/0/.hydra/overrides.yaml
deleted file mode 100644
index 989520ff..00000000
--- a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(1)-sequence_length(256)-new_tokens(512)/0/.hydra/overrides.yaml
+++ /dev/null
@@ -1 +0,0 @@
-- benchmark.input_shapes.batch_size=1
diff --git a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(1)-sequence_length(256)-new_tokens(512)/0/hydra_config.yaml b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(1)-sequence_length(256)-new_tokens(512)/0/hydra_config.yaml
deleted file mode 100644
index 256e0e54..00000000
--- a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(1)-sequence_length(256)-new_tokens(512)/0/hydra_config.yaml
+++ /dev/null
@@ -1,79 +0,0 @@
-backend:
-  name: pytorch
-  version: 2.1.0+cu118
-  _target_: optimum_benchmark.backends.pytorch.backend.PyTorchBackend
-  seed: 42
-  inter_op_num_threads: null
-  intra_op_num_threads: null
-  initial_isolation_check: false
-  continous_isolation_check: false
-  delete_cache: false
-  no_weights: false
-  device_map: null
-  torch_dtype: float16
-  disable_grad: true
-  eval_mode: true
-  amp_autocast: false
-  amp_dtype: null
-  torch_compile: false
-  torch_compile_config: {}
-  bettertransformer: false
-  quantization_scheme: null
-  quantization_config: {}
-  use_ddp: false
-  ddp_config: {}
-  peft_strategy: null
-  peft_config: {}
-benchmark:
-  name: inference
-  _target_: optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark
-  duration: 10
-  warmup_runs: 10
-  memory: true
-  energy: false
-  input_shapes:
-    batch_size: 1
-    sequence_length: 256
-    num_choices: 1
-    feature_size: 80
-    nb_max_frames: 3000
-    audio_sequence_length: 16000
-  new_tokens: 512
-  can_diffuse: false
-  can_generate: true
-  forward_kwargs: {}
-  generate_kwargs:
-    max_new_tokens: 512
-    min_new_tokens: 512
-    do_sample: false
-    use_cache: true
-    pad_token_id: 0
-    num_beams: 1
-experiment_name: gptq-batch_size(1)-sequence_length(256)-new_tokens(512)
-model: TheBloke/Llama-2-7B-GPTQ
-device: cuda
-task: text-generation
-hub_kwargs:
-  revision: main
-  cache_dir: null
-  force_download: false
-  local_files_only: false
-environment:
-  optimum_version: 1.13.2
-  optimum_commit: null
-  transformers_version: 4.34.1
-  transformers_commit: null
-  accelerate_version: 0.24.1
-  accelerate_commit: null
-  diffusers_version: null
-  diffusers_commit: null
-  python_version: 3.10.12
-  system: Linux
-  cpu: ' AMD EPYC 7742 64-Core Processor'
-  cpu_count: 128
-  cpu_ram_mb: 540684
-  gpus:
-  - NVIDIA A100-SXM4-80GB
-  - NVIDIA A100-SXM4-80GB
-  - NVIDIA A100-SXM4-80GB
-  - NVIDIA A100-SXM4-80GB
diff --git a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(1)-sequence_length(256)-new_tokens(512)/0/inference_results.csv b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(1)-sequence_length(256)-new_tokens(512)/0/inference_results.csv
deleted file mode 100644
index da04a0c3..00000000
--- a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(1)-sequence_length(256)-new_tokens(512)/0/inference_results.csv
+++ /dev/null
@@ -1,2 +0,0 @@
-forward.latency(s),forward.throughput(samples/s),forward.max_memory_used(MB),forward.max_memory_allocated(MB),forward.max_memory_reserved(MB),generate.latency(s),generate.throughput(tokens/s),generate.max_memory_used(MB),generate.max_memory_allocated(MB),generate.max_memory_reserved(MB)
-0.0458,21.8,6746,4298,4450,14.8,34.6,7606,4916,5309
diff --git a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(16)-sequence_length(256)-new_tokens(512)/4/.hydra/config.yaml b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(16)-sequence_length(256)-new_tokens(512)/4/.hydra/config.yaml
deleted file mode 100644
index 58457ece..00000000
--- a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(16)-sequence_length(256)-new_tokens(512)/4/.hydra/config.yaml
+++ /dev/null
@@ -1,73 +0,0 @@
-backend:
-  name: pytorch
-  version: ${pytorch_version:}
-  _target_: optimum_benchmark.backends.pytorch.backend.PyTorchBackend
-  seed: 42
-  inter_op_num_threads: null
-  intra_op_num_threads: null
-  initial_isolation_check: false
-  continous_isolation_check: false
-  delete_cache: false
-  no_weights: false
-  device_map: null
-  torch_dtype: float16
-  disable_grad: ${is_inference:${benchmark.name}}
-  eval_mode: ${is_inference:${benchmark.name}}
-  amp_autocast: false
-  amp_dtype: null
-  torch_compile: false
-  torch_compile_config: {}
-  bettertransformer: false
-  quantization_scheme: null
-  quantization_config: {}
-  use_ddp: false
-  ddp_config: {}
-  peft_strategy: null
-  peft_config: {}
-benchmark:
-  name: inference
-  _target_: optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark
-  duration: 10
-  warmup_runs: 10
-  memory: true
-  energy: false
-  input_shapes:
-    batch_size: 16
-    sequence_length: 256
-    num_choices: 1
-    feature_size: 80
-    nb_max_frames: 3000
-    audio_sequence_length: 16000
-  new_tokens: 512
-  can_diffuse: ${can_diffuse:${task}}
-  can_generate: ${can_generate:${task}}
-  forward_kwargs: {}
-  generate_kwargs: {}
-experiment_name: gptq-batch_size(${benchmark.input_shapes.batch_size})-sequence_length(${benchmark.input_shapes.sequence_length})-new_tokens(${benchmark.new_tokens})
-model: TheBloke/Llama-2-7B-GPTQ
-device: cuda:0
-task: ${infer_task:${model}}
-hub_kwargs:
-  revision: main
-  cache_dir: null
-  force_download: false
-  local_files_only: false
-environment:
-  optimum_version: 1.13.2
-  optimum_commit: null
-  transformers_version: 4.34.1
-  transformers_commit: null
-  accelerate_version: 0.24.1
-  accelerate_commit: null
-  diffusers_version: null
-  diffusers_commit: null
-  python_version: 3.10.12
-  system: Linux
-  cpu: ' AMD EPYC 7742 64-Core Processor'
-  cpu_count: 128
-  cpu_ram_mb: 540684
-  gpus:
-  - NVIDIA A100-SXM4-80GB
-  - NVIDIA A100-SXM4-80GB
-  - NVIDIA A100-SXM4-80GB
-  - NVIDIA A100-SXM4-80GB
diff --git a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(16)-sequence_length(256)-new_tokens(512)/4/.hydra/hydra.yaml b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(16)-sequence_length(256)-new_tokens(512)/4/.hydra/hydra.yaml
deleted file mode 100644
index c9e56066..00000000
--- a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(16)-sequence_length(256)-new_tokens(512)/4/.hydra/hydra.yaml
+++ /dev/null
@@ -1,174 +0,0 @@
-hydra:
-  run:
-    dir: experiments/${experiment_name}
-  sweep:
-    dir: experiments/${experiment_name}
-    subdir: ${hydra.job.num}
-  launcher:
-    _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
-  sweeper:
-    _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
-    max_batch_size: null
-    params:
-      benchmark.input_shapes.batch_size: 1,2,4,8,16
-  help:
-    app_name: ${hydra.job.name}
-    header: '${hydra.help.app_name} is powered by Hydra.
-
-      '
-    footer: 'Powered by Hydra (https://hydra.cc)
-
-      Use --hydra-help to view Hydra specific help
-
-      '
-    template: '${hydra.help.header}
-
-      == Configuration groups ==
-
-      Compose your configuration from those groups (group=option)
-
-
-      $APP_CONFIG_GROUPS
-
-
-      == Config ==
-
-      Override anything in the config (foo.bar=value)
-
-
-      $CONFIG
-
-
-      ${hydra.help.footer}
-
-      '
-  hydra_help:
-    template: 'Hydra (${hydra.runtime.version})
-
-      See https://hydra.cc for more info.
-
-
-      == Flags ==
-
-      $FLAGS_HELP
-
-
-      == Configuration groups ==
-
-      Compose your configuration from those groups (For example, append hydra/job_logging=disabled
-      to command line)
-
-
-      $HYDRA_CONFIG_GROUPS
-
-
-      Use ''--cfg hydra'' to Show the Hydra config.
-
-      '
-    hydra_help: ???
-  hydra_logging:
-    version: 1
-    formatters:
-      colorlog:
-        (): colorlog.ColoredFormatter
-        format: '[%(cyan)s%(asctime)s%(reset)s][%(purple)sHYDRA%(reset)s] %(message)s'
-    handlers:
-      console:
-        class: logging.StreamHandler
-        formatter: colorlog
-        stream: ext://sys.stdout
-    root:
-      level: INFO
-      handlers:
-      - console
-    disable_existing_loggers: false
-  job_logging:
-    version: 1
-    formatters:
-      simple:
-        format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
-      colorlog:
-        (): colorlog.ColoredFormatter
-        format: '[%(cyan)s%(asctime)s%(reset)s][%(blue)s%(name)s%(reset)s][%(log_color)s%(levelname)s%(reset)s]
-          - %(message)s'
-        log_colors:
-          DEBUG: purple
-          INFO: green
-          WARNING: yellow
-          ERROR: red
-          CRITICAL: red
-    handlers:
-      console:
-        class: logging.StreamHandler
-        formatter: colorlog
-        stream: ext://sys.stdout
-      file:
-        class: logging.FileHandler
-        formatter: simple
-        filename: ${hydra.job.name}.log
-    root:
-      level: INFO
-      handlers:
-      - console
-      - file
-    disable_existing_loggers: false
-  env: {}
-  mode: MULTIRUN
-  searchpath: []
-  callbacks: {}
-  output_subdir: .hydra
-  overrides:
-    hydra:
-    - hydra.mode=MULTIRUN
-    task:
-    - benchmark.input_shapes.batch_size=16
-  job:
-    name: experiment
-    chdir: true
-    override_dirname: benchmark.input_shapes.batch_size=16
-    id: '4'
-    num: 4
-    config_name: gptq
-    env_set:
-      CUDA_VISIBLE_DEVICES: '3'
-      CUDA_DEVICE_ORDER: PCI_BUS_ID
-    env_copy: []
-    config:
-      override_dirname:
-        kv_sep: '='
-        item_sep: ','
-        exclude_keys: []
-  runtime:
-    version: 1.3.2
-    version_base: '1.3'
-    cwd: /workspace/optimum-benchmark/examples/running-llamas
-    config_sources:
-    - path: hydra.conf
-      schema: pkg
-      provider: hydra
-    - path: optimum_benchmark
-      schema: pkg
-      provider: main
-    - path: hydra_plugins.hydra_colorlog.conf
-      schema: pkg
-      provider: hydra-colorlog
-    - path: /workspace/optimum-benchmark/examples/running-llamas/configs
-      schema: file
-      provider: command-line
-    - path: ''
-      schema: structured
-      provider: schema
-    output_dir: /workspace/optimum-benchmark/examples/running-llamas/experiments/gptq-batch_size(16)-sequence_length(256)-new_tokens(512)/4
-    choices:
-      benchmark: inference
-      backend: pytorch
-      hydra/env: default
-      hydra/callbacks: null
-      hydra/job_logging: colorlog
-      hydra/hydra_logging: colorlog
-      hydra/hydra_help: default
-      hydra/help: default
-      hydra/sweeper: basic
-      hydra/launcher: basic
-      hydra/output: default
-  verbose: false
diff --git a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(16)-sequence_length(256)-new_tokens(512)/4/.hydra/overrides.yaml b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(16)-sequence_length(256)-new_tokens(512)/4/.hydra/overrides.yaml
deleted file mode 100644
index fdb7f01d..00000000
--- a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(16)-sequence_length(256)-new_tokens(512)/4/.hydra/overrides.yaml
+++ /dev/null
@@ -1 +0,0 @@
-- benchmark.input_shapes.batch_size=16
diff --git a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(16)-sequence_length(256)-new_tokens(512)/4/hydra_config.yaml b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(16)-sequence_length(256)-new_tokens(512)/4/hydra_config.yaml
deleted file mode 100644
index ad91270b..00000000
--- a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(16)-sequence_length(256)-new_tokens(512)/4/hydra_config.yaml
+++ /dev/null
@@ -1,79 +0,0 @@
-backend:
-  name: pytorch
-  version: 2.1.0+cu118
-  _target_: optimum_benchmark.backends.pytorch.backend.PyTorchBackend
-  seed: 42
-  inter_op_num_threads: null
-  intra_op_num_threads: null
-  initial_isolation_check: false
-  continous_isolation_check: false
-  delete_cache: false
-  no_weights: false
-  device_map: null
-  torch_dtype: float16
-  disable_grad: true
-  eval_mode: true
-  amp_autocast: false
-  amp_dtype: null
-  torch_compile: false
-  torch_compile_config: {}
-  bettertransformer: false
-  quantization_scheme: null
-  quantization_config: {}
-  use_ddp: false
-  ddp_config: {}
-  peft_strategy: null
-  peft_config: {}
-benchmark:
-  name: inference
-  _target_: optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark
-  duration: 10
-  warmup_runs: 10
-  memory: true
-  energy: false
-  input_shapes:
-    batch_size: 16
-    sequence_length: 256
-    num_choices: 1
-    feature_size: 80
-    nb_max_frames: 3000
-    audio_sequence_length: 16000
-  new_tokens: 512
-  can_diffuse: false
-  can_generate: true
-  forward_kwargs: {}
-  generate_kwargs:
-    max_new_tokens: 512
-    min_new_tokens: 512
-    do_sample: false
-    use_cache: true
-    pad_token_id: 0
-    num_beams: 1
-experiment_name: gptq-batch_size(16)-sequence_length(256)-new_tokens(512)
-model: TheBloke/Llama-2-7B-GPTQ
-device: cuda:0
-task: text-generation
-hub_kwargs:
-  revision: main
-  cache_dir: null
-  force_download: false
-  local_files_only: false
-environment:
-  optimum_version: 1.13.2
-  optimum_commit: null
-  transformers_version: 4.34.1
-  transformers_commit: null
-  accelerate_version: 0.24.1
-  accelerate_commit: null
-  diffusers_version: null
-  diffusers_commit: null
-  python_version: 3.10.12
-  system: Linux
-  cpu: ' AMD EPYC 7742 64-Core Processor'
-  cpu_count: 128
-  cpu_ram_mb: 540684
-  gpus:
-  - NVIDIA A100-SXM4-80GB
-  - NVIDIA A100-SXM4-80GB
-  - NVIDIA A100-SXM4-80GB
-  - NVIDIA A100-SXM4-80GB
diff --git a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(16)-sequence_length(256)-new_tokens(512)/4/inference_results.csv b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(16)-sequence_length(256)-new_tokens(512)/4/inference_results.csv
deleted file mode 100644
index 0b114e75..00000000
--- a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(16)-sequence_length(256)-new_tokens(512)/4/inference_results.csv
+++ /dev/null
@@ -1,2 +0,0 @@
-forward.latency(s),forward.throughput(samples/s),forward.max_memory_used(MB),forward.max_memory_allocated(MB),forward.max_memory_reserved(MB),generate.latency(s),generate.throughput(tokens/s),generate.max_memory_used(MB),generate.max_memory_allocated(MB),generate.max_memory_reserved(MB)
-0.415,38.6,10900,7080,8604,24.6,333.0,65676,17002,83596
diff --git a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(2)-sequence_length(256)-new_tokens(512)/1/.hydra/config.yaml b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(2)-sequence_length(256)-new_tokens(512)/1/.hydra/config.yaml
deleted file mode 100644
index 90ddbb6e..00000000
--- a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(2)-sequence_length(256)-new_tokens(512)/1/.hydra/config.yaml
+++ /dev/null
@@ -1,73 +0,0 @@
-backend:
-  name: pytorch
-  version: ${pytorch_version:}
-  _target_: optimum_benchmark.backends.pytorch.backend.PyTorchBackend
-  seed: 42
-  inter_op_num_threads: null
-  intra_op_num_threads: null
-  initial_isolation_check: false
-  continous_isolation_check: false
-  delete_cache: false
-  no_weights: false
-  device_map: null
-  torch_dtype: float16
-  disable_grad: ${is_inference:${benchmark.name}}
-  eval_mode: ${is_inference:${benchmark.name}}
-  amp_autocast: false
-  amp_dtype: null
-  torch_compile: false
-  torch_compile_config: {}
-  bettertransformer: false
-  quantization_scheme: null
-  quantization_config: {}
-  use_ddp: false
-  ddp_config: {}
-  peft_strategy: null
-  peft_config: {}
-benchmark:
-  name: inference
-  _target_: optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark
-  duration: 10
-  warmup_runs: 10
-  memory: true
-  energy: false
-  input_shapes:
-    batch_size: 2
-    sequence_length: 256
-    num_choices: 1
-    feature_size: 80
-    nb_max_frames: 3000
-    audio_sequence_length: 16000
-  new_tokens: 512
-  can_diffuse: ${can_diffuse:${task}}
-  can_generate: ${can_generate:${task}}
-  forward_kwargs: {}
-  generate_kwargs: {}
-experiment_name: gptq-batch_size(${benchmark.input_shapes.batch_size})-sequence_length(${benchmark.input_shapes.sequence_length})-new_tokens(${benchmark.new_tokens})
-model: TheBloke/Llama-2-7B-GPTQ
-device: cuda:0
-task: ${infer_task:${model}}
-hub_kwargs:
-  revision: main
-  cache_dir: null
-  force_download: false
-  local_files_only: false
-environment:
-  optimum_version: 1.13.2
-  optimum_commit: null
-  transformers_version: 4.34.1
-  transformers_commit: null
-  accelerate_version: 0.24.1
-  accelerate_commit: null
-  diffusers_version: null
-  diffusers_commit: null
-  python_version: 3.10.12
-  system: Linux
-  cpu: ' AMD EPYC 7742 64-Core Processor'
-  cpu_count: 128
-  cpu_ram_mb: 540684
-  gpus:
-  - NVIDIA A100-SXM4-80GB
-  - NVIDIA A100-SXM4-80GB
-  - NVIDIA A100-SXM4-80GB
-  - NVIDIA A100-SXM4-80GB
diff --git a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(2)-sequence_length(256)-new_tokens(512)/1/.hydra/hydra.yaml b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(2)-sequence_length(256)-new_tokens(512)/1/.hydra/hydra.yaml
deleted file mode 100644
index 77d220f4..00000000
--- a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(2)-sequence_length(256)-new_tokens(512)/1/.hydra/hydra.yaml
+++ /dev/null
@@ -1,174 +0,0 @@
-hydra:
-  run:
-    dir: experiments/${experiment_name}
-  sweep:
-    dir: experiments/${experiment_name}
-    subdir: ${hydra.job.num}
-  launcher:
-    _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
-  sweeper:
-    _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
-    max_batch_size: null
-    params:
-      benchmark.input_shapes.batch_size: 1,2,4,8,16
-  help:
-    app_name: ${hydra.job.name}
-    header: '${hydra.help.app_name} is powered by Hydra.
-
-      '
-    footer: 'Powered by Hydra (https://hydra.cc)
-
-      Use --hydra-help to view Hydra specific help
-
-      '
-    template: '${hydra.help.header}
-
-      == Configuration groups ==
-
-      Compose your configuration from those groups (group=option)
-
-
-      $APP_CONFIG_GROUPS
-
-
-      == Config ==
-
-      Override anything in the config (foo.bar=value)
-
-
-      $CONFIG
-
-
-      ${hydra.help.footer}
-
-      '
-  hydra_help:
-    template: 'Hydra (${hydra.runtime.version})
-
-      See https://hydra.cc for more info.
-
-
-      == Flags ==
-
-      $FLAGS_HELP
-
-
-      == Configuration groups ==
-
-      Compose your configuration from those groups (For example, append hydra/job_logging=disabled
-      to command line)
-
-
-      $HYDRA_CONFIG_GROUPS
-
-
-      Use ''--cfg hydra'' to Show the Hydra config.
-
-      '
-    hydra_help: ???
-  hydra_logging:
-    version: 1
-    formatters:
-      colorlog:
-        (): colorlog.ColoredFormatter
-        format: '[%(cyan)s%(asctime)s%(reset)s][%(purple)sHYDRA%(reset)s] %(message)s'
-    handlers:
-      console:
-        class: logging.StreamHandler
-        formatter: colorlog
-        stream: ext://sys.stdout
-    root:
-      level: INFO
-      handlers:
-      - console
-    disable_existing_loggers: false
-  job_logging:
-    version: 1
-    formatters:
-      simple:
-        format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
-      colorlog:
-        (): colorlog.ColoredFormatter
-        format: '[%(cyan)s%(asctime)s%(reset)s][%(blue)s%(name)s%(reset)s][%(log_color)s%(levelname)s%(reset)s]
-          - %(message)s'
-        log_colors:
-          DEBUG: purple
-          INFO: green
-          WARNING: yellow
-          ERROR: red
-          CRITICAL: red
-    handlers:
-      console:
-        class: logging.StreamHandler
-        formatter: colorlog
-        stream: ext://sys.stdout
-      file:
-        class: logging.FileHandler
-        formatter: simple
-        filename: ${hydra.job.name}.log
-    root:
-      level: INFO
-      handlers:
-      - console
-      - file
-    disable_existing_loggers: false
-  env: {}
-  mode: MULTIRUN
-  searchpath: []
-  callbacks: {}
-  output_subdir: .hydra
-  overrides:
-    hydra:
-    - hydra.mode=MULTIRUN
-    task:
-    - benchmark.input_shapes.batch_size=2
-  job:
-    name: experiment
-    chdir: true
-    override_dirname: benchmark.input_shapes.batch_size=2
-    id: '1'
-    num: 1
-    config_name: gptq
-    env_set:
-      CUDA_VISIBLE_DEVICES: '3'
-      CUDA_DEVICE_ORDER: PCI_BUS_ID
-    env_copy: []
-    config:
-      override_dirname:
-        kv_sep: '='
-        item_sep: ','
-        exclude_keys: []
-  runtime:
-    version: 1.3.2
-    version_base: '1.3'
-    cwd: /workspace/optimum-benchmark/examples/running-llamas
-    config_sources:
-    - path: hydra.conf
-      schema: pkg
-      provider: hydra
-    - path: optimum_benchmark
-      schema: pkg
-      provider: main
-    - path: hydra_plugins.hydra_colorlog.conf
-      schema: pkg
-      provider: hydra-colorlog
-    - path: /workspace/optimum-benchmark/examples/running-llamas/configs
-      schema: file
-      provider: command-line
-    - path: ''
-      schema: structured
-      provider: schema
-    output_dir: /workspace/optimum-benchmark/examples/running-llamas/experiments/gptq-batch_size(2)-sequence_length(256)-new_tokens(512)/1
-    choices:
-      benchmark: inference
-      backend: pytorch
-      hydra/env: default
-      hydra/callbacks: null
-      hydra/job_logging: colorlog
-      hydra/hydra_logging: colorlog
-      hydra/hydra_help: default
-      hydra/help: default
-      hydra/sweeper: basic
-      hydra/launcher: basic
-      hydra/output: default
-  verbose: false
diff --git a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(2)-sequence_length(256)-new_tokens(512)/1/.hydra/overrides.yaml b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(2)-sequence_length(256)-new_tokens(512)/1/.hydra/overrides.yaml
deleted file mode 100644
index 8211b85f..00000000
--- a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(2)-sequence_length(256)-new_tokens(512)/1/.hydra/overrides.yaml
+++ /dev/null
@@ -1 +0,0 @@
-- benchmark.input_shapes.batch_size=2
diff --git a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(2)-sequence_length(256)-new_tokens(512)/1/hydra_config.yaml b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(2)-sequence_length(256)-new_tokens(512)/1/hydra_config.yaml
deleted file mode 100644
index d946b75c..00000000
--- a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(2)-sequence_length(256)-new_tokens(512)/1/hydra_config.yaml
+++ /dev/null
@@ -1,79 +0,0 @@
-backend:
-  name: pytorch
-  version: 2.1.0+cu118
-  _target_: optimum_benchmark.backends.pytorch.backend.PyTorchBackend
-  seed: 42
-  inter_op_num_threads: null
-  intra_op_num_threads: null
-  initial_isolation_check: false
-  continous_isolation_check: false
-  delete_cache: false
-  no_weights: false
-  device_map: null
-  torch_dtype: float16
-  disable_grad: true
-  eval_mode: true
-  amp_autocast: false
-  amp_dtype: null
-  torch_compile: false
-  torch_compile_config: {}
-  bettertransformer: false
-  quantization_scheme: null
-  quantization_config: {}
-  use_ddp: false
-  ddp_config: {}
-  peft_strategy: null
-  peft_config: {}
-benchmark:
-  name: inference
-  _target_: optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark
-  duration: 10
-  warmup_runs: 10
-  memory: true
-  energy: false
-  input_shapes:
-    batch_size: 2
-    sequence_length: 256
-    num_choices: 1
-    feature_size: 80
-    nb_max_frames: 3000
-    audio_sequence_length: 16000
-  new_tokens: 512
-  can_diffuse: false
-  can_generate: true
-  forward_kwargs: {}
-  generate_kwargs:
-    max_new_tokens: 512
-    min_new_tokens: 512
-    do_sample: false
-    use_cache: true
-    pad_token_id: 0
-    num_beams: 1
-experiment_name: gptq-batch_size(2)-sequence_length(256)-new_tokens(512)
-model: TheBloke/Llama-2-7B-GPTQ
-device: cuda
-task: text-generation
-hub_kwargs:
-  revision: main
-  cache_dir: null
-  force_download: false
-  local_files_only: false
-environment:
-  optimum_version: 1.13.2
-  optimum_commit: null
-  transformers_version: 4.34.1
-  transformers_commit: null
-  accelerate_version: 0.24.1
-  accelerate_commit: null
-  diffusers_version: null
-  diffusers_commit: null
-  python_version: 3.10.12
-  system: Linux
-  cpu: ' AMD EPYC 7742 64-Core Processor'
-  cpu_count: 128
-  cpu_ram_mb: 540684
-  gpus:
-  - NVIDIA A100-SXM4-80GB
-  - NVIDIA A100-SXM4-80GB
-  - NVIDIA A100-SXM4-80GB
-  - NVIDIA A100-SXM4-80GB
diff --git a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(2)-sequence_length(256)-new_tokens(512)/1/inference_results.csv b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(2)-sequence_length(256)-new_tokens(512)/1/inference_results.csv
deleted file mode 100644
index 1797a299..00000000
--- a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(2)-sequence_length(256)-new_tokens(512)/1/inference_results.csv
+++ /dev/null
@@ -1,2 +0,0 @@
-forward.latency(s),forward.throughput(samples/s),forward.max_memory_used(MB),forward.max_memory_allocated(MB),forward.max_memory_reserved(MB),generate.latency(s),generate.throughput(tokens/s),generate.max_memory_used(MB),generate.max_memory_allocated(MB),generate.max_memory_reserved(MB)
-0.0706,28.3,6872,4484,4575,15.4,66.5,8822,5722,6526
diff --git a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(2)-sequence_length(256)-new_tokens(512)/multirun.yaml b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(2)-sequence_length(256)-new_tokens(512)/multirun.yaml
deleted file mode 100644
index 667d738e..00000000
--- a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(2)-sequence_length(256)-new_tokens(512)/multirun.yaml
+++ /dev/null
@@ -1,246 +0,0 @@
-hydra:
-  run:
-    dir: experiments/${experiment_name}
-  sweep:
-    dir: experiments/${experiment_name}
-    subdir: ${hydra.job.num}
-  launcher:
-    _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
-  sweeper:
-    _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
-    max_batch_size: null
-    params:
-      benchmark.input_shapes.batch_size: 1,2,4,8,16
-  help:
-    app_name: ${hydra.job.name}
-    header: '${hydra.help.app_name} is powered by Hydra.
-
-      '
-    footer: 'Powered by Hydra (https://hydra.cc)
-
-      Use --hydra-help to view Hydra specific help
-
-      '
-    template: '${hydra.help.header}
-
-      == Configuration groups ==
-
-      Compose your configuration from those groups (group=option)
-
-
-      $APP_CONFIG_GROUPS
-
-
-      == Config ==
-
-      Override anything in the config (foo.bar=value)
-
-
-      $CONFIG
-
-
-      ${hydra.help.footer}
-
-      '
-  hydra_help:
-    template: 'Hydra (${hydra.runtime.version})
-
-      See https://hydra.cc for more info.
-
-
-      == Flags ==
-
-      $FLAGS_HELP
-
-
-      == Configuration groups ==
-
-      Compose your configuration from those groups (For example, append hydra/job_logging=disabled
-      to command line)
-
-
-      $HYDRA_CONFIG_GROUPS
-
-
-      Use ''--cfg hydra'' to Show the Hydra config.
-
-      '
-    hydra_help: ???
-  hydra_logging:
-    version: 1
-    formatters:
-      colorlog:
-        (): colorlog.ColoredFormatter
-        format: '[%(cyan)s%(asctime)s%(reset)s][%(purple)sHYDRA%(reset)s] %(message)s'
-    handlers:
-      console:
-        class: logging.StreamHandler
-        formatter: colorlog
-        stream: ext://sys.stdout
-    root:
-      level: INFO
-      handlers:
-      - console
-    disable_existing_loggers: false
-  job_logging:
-    version: 1
-    formatters:
-      simple:
-        format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
-      colorlog:
-        (): colorlog.ColoredFormatter
-        format: '[%(cyan)s%(asctime)s%(reset)s][%(blue)s%(name)s%(reset)s][%(log_color)s%(levelname)s%(reset)s]
-          - %(message)s'
-        log_colors:
-          DEBUG: purple
-          INFO: green
-          WARNING: yellow
-          ERROR: red
-          CRITICAL: red
-    handlers:
-      console:
-        class: logging.StreamHandler
-        formatter: colorlog
-        stream: ext://sys.stdout
-      file:
-        class: logging.FileHandler
-        formatter: simple
-        filename: ${hydra.job.name}.log
-    root:
-      level: INFO
-      handlers:
-      - console
-      - file
-    disable_existing_loggers: false
-  env: {}
-  mode: MULTIRUN
-  searchpath: []
-  callbacks: {}
-  output_subdir: .hydra
-  overrides:
-    hydra:
-    - hydra.mode=MULTIRUN
-    task: []
-  job:
-    name: experiment
-    chdir: true
-    override_dirname: ''
-    id: ???
-    num: ???
-    config_name: gptq
-    env_set:
-      CUDA_VISIBLE_DEVICES: '3'
-      CUDA_DEVICE_ORDER: PCI_BUS_ID
-    env_copy: []
-    config:
-      override_dirname:
-        kv_sep: '='
-        item_sep: ','
-        exclude_keys: []
-  runtime:
-    version: 1.3.2
-    version_base: '1.3'
-    cwd: /workspace/optimum-benchmark/examples/running-llamas
-    config_sources:
-    - path: hydra.conf
-      schema: pkg
-      provider: hydra
-    - path: optimum_benchmark
-      schema: pkg
-      provider: main
-    - path: hydra_plugins.hydra_colorlog.conf
-      schema: pkg
-      provider: hydra-colorlog
-    - path: /workspace/optimum-benchmark/examples/running-llamas/configs
-      schema: file
-      provider: command-line
-    - path: ''
-      schema: structured
-      provider: schema
-    output_dir: ???
-    choices:
-      benchmark: inference
-      backend: pytorch
-      hydra/env: default
-      hydra/callbacks: null
-      hydra/job_logging: colorlog
-      hydra/hydra_logging: colorlog
-      hydra/hydra_help: default
-      hydra/help: default
-      hydra/sweeper: basic
-      hydra/launcher: basic
-      hydra/output: default
-  verbose: false
-backend:
-  name: pytorch
-  version: ${pytorch_version:}
-  _target_: optimum_benchmark.backends.pytorch.backend.PyTorchBackend
-  seed: 42
-  inter_op_num_threads: null
-  intra_op_num_threads: null
-  initial_isolation_check: false
-  continous_isolation_check: false
-  delete_cache: false
-  no_weights: false
-  device_map: null
-  torch_dtype: float16
-  disable_grad: ${is_inference:${benchmark.name}}
-  eval_mode: ${is_inference:${benchmark.name}}
-  amp_autocast: false
-  amp_dtype: null
-  torch_compile: false
-  torch_compile_config: {}
-  bettertransformer: false
-  quantization_scheme: null
-  quantization_config: {}
-  use_ddp: false
-  ddp_config: {}
-  peft_strategy: null
-  peft_config: {}
-benchmark:
-  name: inference
-  _target_: optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark
-  duration: 10
-  warmup_runs: 10
-  memory: true
-  energy: false
-  input_shapes:
-    batch_size: 2
-    sequence_length: 256
-    num_choices: 1
-    feature_size: 80
-    nb_max_frames: 3000
-    audio_sequence_length: 16000
-  new_tokens: 512
-  can_diffuse: ${can_diffuse:${task}}
-  can_generate: ${can_generate:${task}}
-  forward_kwargs: {}
-  generate_kwargs: {}
-experiment_name: gptq-batch_size(${benchmark.input_shapes.batch_size})-sequence_length(${benchmark.input_shapes.sequence_length})-new_tokens(${benchmark.new_tokens})
-model: TheBloke/Llama-2-7B-GPTQ
-device: cuda:0
-task: ${infer_task:${model}}
-hub_kwargs:
-  revision: main
-  cache_dir: null
-  force_download: false
-  local_files_only: false
-environment:
-  optimum_version: 1.13.2
-  optimum_commit: null
-  transformers_version: 4.34.1
-  transformers_commit: null
-  accelerate_version: 0.24.1
-  accelerate_commit: null
-  diffusers_version: null
-  diffusers_commit: null
-  python_version: 3.10.12
-  system: Linux
-  cpu: ' AMD EPYC 7742 64-Core Processor'
-  cpu_count: 128
-  cpu_ram_mb: 540684
-  gpus:
-  - NVIDIA A100-SXM4-80GB
-  - NVIDIA A100-SXM4-80GB
-  - NVIDIA A100-SXM4-80GB
-  - NVIDIA A100-SXM4-80GB
diff --git a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(4)-sequence_length(256)-new_tokens(512)/2/.hydra/config.yaml b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(4)-sequence_length(256)-new_tokens(512)/2/.hydra/config.yaml
deleted file mode 100644
index 97f81007..00000000
--- a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(4)-sequence_length(256)-new_tokens(512)/2/.hydra/config.yaml
+++ /dev/null
@@ -1,73 +0,0 @@
-backend:
-  name: pytorch
-  version: ${pytorch_version:}
-  _target_: optimum_benchmark.backends.pytorch.backend.PyTorchBackend
-  seed: 42
-  inter_op_num_threads: null
-  intra_op_num_threads: null
-  initial_isolation_check: false
-  continous_isolation_check: false
-  delete_cache: false
-  no_weights: false
-  device_map: null
-  torch_dtype: float16
-  disable_grad: ${is_inference:${benchmark.name}}
-  eval_mode: ${is_inference:${benchmark.name}}
-  amp_autocast: false
-  amp_dtype: null
-  torch_compile: false
-  torch_compile_config: {}
-  bettertransformer: false
-  quantization_scheme: null
-  quantization_config: {}
-  use_ddp: false
-  ddp_config: {}
-  peft_strategy: null
-  peft_config: {}
-benchmark:
-  name: inference
-  _target_: optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark
-  duration: 10
-  warmup_runs: 10
-  memory: true
-  energy: false
-  input_shapes:
-    batch_size: 4
-    sequence_length: 256
-    num_choices: 1
-    feature_size: 80
-    nb_max_frames: 3000
-    audio_sequence_length: 16000
-  new_tokens: 512
-  can_diffuse: ${can_diffuse:${task}}
-  can_generate: ${can_generate:${task}}
-  forward_kwargs: {}
-  generate_kwargs: {}
-experiment_name: gptq-batch_size(${benchmark.input_shapes.batch_size})-sequence_length(${benchmark.input_shapes.sequence_length})-new_tokens(${benchmark.new_tokens})
-model: TheBloke/Llama-2-7B-GPTQ
-device: cuda:0
-task: ${infer_task:${model}}
-hub_kwargs:
-  revision: main
-  cache_dir: null
-  force_download: false
-  local_files_only: false
-environment:
-  optimum_version: 1.13.2
-  optimum_commit: null
-  transformers_version: 4.34.1
-  transformers_commit: null
-  accelerate_version: 0.24.1
-  accelerate_commit: null
-  diffusers_version: null
-  diffusers_commit: null
-  python_version: 3.10.12
-  system: Linux
-  cpu: ' AMD EPYC 7742 64-Core Processor'
-  cpu_count: 128
-  cpu_ram_mb: 540684
-  gpus:
-  - NVIDIA A100-SXM4-80GB
-  - NVIDIA A100-SXM4-80GB
-  - NVIDIA A100-SXM4-80GB
-  - NVIDIA A100-SXM4-80GB
diff --git a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(4)-sequence_length(256)-new_tokens(512)/2/.hydra/hydra.yaml b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(4)-sequence_length(256)-new_tokens(512)/2/.hydra/hydra.yaml
deleted file mode 100644
index d5efb914..00000000
--- a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(4)-sequence_length(256)-new_tokens(512)/2/.hydra/hydra.yaml
+++ /dev/null
@@ -1,174 +0,0 @@
-hydra:
-  run:
-    dir: experiments/${experiment_name}
-  sweep:
-    dir: experiments/${experiment_name}
-    subdir: ${hydra.job.num}
-  launcher:
-    _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
-  sweeper:
-    _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
-    max_batch_size: null
-    params:
-      benchmark.input_shapes.batch_size: 1,2,4,8,16
-  help:
-    app_name: ${hydra.job.name}
-    header: '${hydra.help.app_name} is powered by Hydra.
-
-      '
-    footer: 'Powered by Hydra (https://hydra.cc)
-
-      Use --hydra-help to view Hydra specific help
-
-      '
-    template: '${hydra.help.header}
-
-      == Configuration groups ==
-
-      Compose your configuration from those groups (group=option)
-
-
-      $APP_CONFIG_GROUPS
-
-
-      == Config ==
-
-      Override anything in the config (foo.bar=value)
-
-
-      $CONFIG
-
-
-      ${hydra.help.footer}
-
-      '
-  hydra_help:
-    template: 'Hydra (${hydra.runtime.version})
-
-      See https://hydra.cc for more info.
-
-
-      == Flags ==
-
-      $FLAGS_HELP
-
-
-      == Configuration groups ==
-
-      Compose your configuration from those groups (For example, append hydra/job_logging=disabled
-      to command line)
-
-
-      $HYDRA_CONFIG_GROUPS
-
-
-      Use ''--cfg hydra'' to Show the Hydra config.
-
-      '
-    hydra_help: ???
-  hydra_logging:
-    version: 1
-    formatters:
-      colorlog:
-        (): colorlog.ColoredFormatter
-        format: '[%(cyan)s%(asctime)s%(reset)s][%(purple)sHYDRA%(reset)s] %(message)s'
-    handlers:
-      console:
-        class: logging.StreamHandler
-        formatter: colorlog
-        stream: ext://sys.stdout
-    root:
-      level: INFO
-      handlers:
-      - console
-    disable_existing_loggers: false
-  job_logging:
-    version: 1
-    formatters:
-      simple:
-        format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
-      colorlog:
-        (): colorlog.ColoredFormatter
-        format: '[%(cyan)s%(asctime)s%(reset)s][%(blue)s%(name)s%(reset)s][%(log_color)s%(levelname)s%(reset)s]
-          - %(message)s'
-        log_colors:
-          DEBUG: purple
-          INFO: green
-          WARNING: yellow
-          ERROR: red
-          CRITICAL: red
-    handlers:
-      console:
-        class: logging.StreamHandler
-        formatter: colorlog
-        stream: ext://sys.stdout
-      file:
-        class: logging.FileHandler
-        formatter: simple
-        filename: ${hydra.job.name}.log
-    root:
-      level: INFO
-      handlers:
-      - console
-      - file
-    disable_existing_loggers: false
-  env: {}
-  mode: MULTIRUN
-  searchpath: []
-  callbacks: {}
-  output_subdir: .hydra
-  overrides:
-    hydra:
-    - hydra.mode=MULTIRUN
-    task:
-    - benchmark.input_shapes.batch_size=4
-  job:
-    name: experiment
-    chdir: true
-    override_dirname: benchmark.input_shapes.batch_size=4
-    id: '2'
-    num: 2
-    config_name: gptq
-    env_set:
-      CUDA_VISIBLE_DEVICES: '3'
-      CUDA_DEVICE_ORDER: PCI_BUS_ID
-    env_copy: []
-    config:
-      override_dirname:
-        kv_sep: '='
-        item_sep: ','
-        exclude_keys: []
-  runtime:
-    version: 1.3.2
-    version_base: '1.3'
-    cwd: /workspace/optimum-benchmark/examples/running-llamas
-    config_sources:
-    - path: hydra.conf
-      schema: pkg
-      provider: hydra
-    - path: optimum_benchmark
-      schema: pkg
-      provider: main
-    - path: hydra_plugins.hydra_colorlog.conf
-      schema: pkg
-      provider: hydra-colorlog
-    - path: /workspace/optimum-benchmark/examples/running-llamas/configs
-      schema: file
-      provider: command-line
-    - path: ''
-      schema: structured
-      provider: schema
-    output_dir: /workspace/optimum-benchmark/examples/running-llamas/experiments/gptq-batch_size(4)-sequence_length(256)-new_tokens(512)/2
-    choices:
-      benchmark: inference
-      backend: pytorch
-      hydra/env: default
-      hydra/callbacks: null
-      hydra/job_logging: colorlog
-      hydra/hydra_logging: colorlog
-      hydra/hydra_help: default
-      hydra/help: default
-      hydra/sweeper: basic
-      hydra/launcher: basic
-      hydra/output: default
-  verbose: false
diff --git a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(4)-sequence_length(256)-new_tokens(512)/2/.hydra/overrides.yaml b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(4)-sequence_length(256)-new_tokens(512)/2/.hydra/overrides.yaml
deleted file mode 100644
index eef8c9ca..00000000
--- a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(4)-sequence_length(256)-new_tokens(512)/2/.hydra/overrides.yaml
+++ /dev/null
@@ -1 +0,0 @@
-- benchmark.input_shapes.batch_size=4
diff --git a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(4)-sequence_length(256)-new_tokens(512)/2/hydra_config.yaml b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(4)-sequence_length(256)-new_tokens(512)/2/hydra_config.yaml
deleted file mode 100644
index eb2c9cbd..00000000
--- a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(4)-sequence_length(256)-new_tokens(512)/2/hydra_config.yaml
+++ /dev/null
@@ -1,79 +0,0 @@
-backend:
-  name: pytorch
-  version: 2.1.0+cu118
-  _target_: optimum_benchmark.backends.pytorch.backend.PyTorchBackend
-  seed: 42
-  inter_op_num_threads: null
-  intra_op_num_threads: null
-  initial_isolation_check: false
-  continous_isolation_check: false
-  delete_cache: false
-  no_weights: false
-  device_map: null
-  torch_dtype: float16
-  disable_grad: true
-  eval_mode: true
-  amp_autocast: false
-  amp_dtype: null
-  torch_compile: false
-  torch_compile_config: {}
-  bettertransformer: false
-  quantization_scheme: null
-  quantization_config: {}
-  use_ddp: false
-  ddp_config: {}
-  peft_strategy: null
-  peft_config: {}
-benchmark:
-  name: inference
-  _target_: optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark
-  duration: 10
-  warmup_runs: 10
-  memory: true
-  energy: false
-  input_shapes:
-    batch_size: 4
-    sequence_length: 256
-    num_choices: 1
-    feature_size: 80
-    nb_max_frames: 3000
-    audio_sequence_length: 16000
-  new_tokens: 512
-  can_diffuse: false
-  can_generate: true
-  forward_kwargs: {}
-  generate_kwargs:
-    max_new_tokens: 512
-    min_new_tokens: 512
-    do_sample: false
-    use_cache: true
-    pad_token_id: 0
-    num_beams: 1
-experiment_name: gptq-batch_size(4)-sequence_length(256)-new_tokens(512)
-model: TheBloke/Llama-2-7B-GPTQ
-device: cuda
-task: text-generation
-hub_kwargs:
-  revision: main
-  cache_dir: null
-  force_download: false
-  local_files_only: false
-environment:
-  optimum_version: 1.13.2
-  optimum_commit: null
-  transformers_version: 4.34.1
-  transformers_commit: null
-  accelerate_version: 0.24.1
-  accelerate_commit: null
-  diffusers_version: null
-  diffusers_commit: null
-  python_version: 3.10.12
-  system: Linux
-  cpu: ' AMD EPYC 7742 64-Core Processor'
-  cpu_count: 128
-  cpu_ram_mb: 540684
-  gpus:
-  - NVIDIA A100-SXM4-80GB
-  - NVIDIA A100-SXM4-80GB
-  - NVIDIA A100-SXM4-80GB
-  - NVIDIA A100-SXM4-80GB
diff --git a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(4)-sequence_length(256)-new_tokens(512)/2/inference_results.csv b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(4)-sequence_length(256)-new_tokens(512)/2/inference_results.csv
deleted file mode 100644
index e54a036c..00000000
--- a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(4)-sequence_length(256)-new_tokens(512)/2/inference_results.csv
+++ /dev/null
@@ -1,2 +0,0 @@
-forward.latency(s),forward.throughput(samples/s),forward.max_memory_used(MB),forward.max_memory_allocated(MB),forward.max_memory_reserved(MB),generate.latency(s),generate.throughput(tokens/s),generate.max_memory_used(MB),generate.max_memory_allocated(MB),generate.max_memory_reserved(MB)
-0.122,32.8,7761,4855,5465,15.3,134.0,18085,7335,15789
diff --git a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(8)-sequence_length(256)-new_tokens(512)/3/.hydra/config.yaml b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(8)-sequence_length(256)-new_tokens(512)/3/.hydra/config.yaml
deleted file mode 100644
index 3a4085f7..00000000
--- a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(8)-sequence_length(256)-new_tokens(512)/3/.hydra/config.yaml
+++ /dev/null
@@ -1,73 +0,0 @@
-backend:
-  name: pytorch
-  version: ${pytorch_version:}
-  _target_: optimum_benchmark.backends.pytorch.backend.PyTorchBackend
-  seed: 42
-  inter_op_num_threads: null
-  intra_op_num_threads: null
-  initial_isolation_check: false
-  continous_isolation_check: false
-  delete_cache: false
-  no_weights: false
-  device_map: null
-  torch_dtype: float16
-  disable_grad: ${is_inference:${benchmark.name}}
-  eval_mode: ${is_inference:${benchmark.name}}
-  amp_autocast: false
-  amp_dtype: null
-  torch_compile: false
-  torch_compile_config: {}
-  bettertransformer: false
-  quantization_scheme: null
-  quantization_config: {}
-  use_ddp: false
-  ddp_config: {}
-  peft_strategy: null
-  peft_config: {}
-benchmark:
-  name: inference
-  _target_: optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark
-  duration: 10
-  warmup_runs: 10
-  memory: true
-  energy: false
-  input_shapes:
-    batch_size: 8
-    sequence_length: 256
-    num_choices: 1
-    feature_size: 80
-    nb_max_frames: 3000
-    audio_sequence_length: 16000
-  new_tokens: 512
-  can_diffuse: ${can_diffuse:${task}}
-  can_generate: ${can_generate:${task}}
-  forward_kwargs: {}
-  generate_kwargs: {}
-experiment_name: gptq-batch_size(${benchmark.input_shapes.batch_size})-sequence_length(${benchmark.input_shapes.sequence_length})-new_tokens(${benchmark.new_tokens})
-model: TheBloke/Llama-2-7B-GPTQ
-device: cuda:0
-task: ${infer_task:${model}}
-hub_kwargs:
-  revision: main
-  cache_dir: null
-  force_download: false
-  local_files_only: false
-environment:
-  optimum_version: 1.13.2
-  optimum_commit: null
-  transformers_version: 4.34.1
-  transformers_commit: null
-  accelerate_version: 0.24.1
-  accelerate_commit: null
-  diffusers_version: null
-  diffusers_commit: null
-  python_version: 3.10.12
-  system: Linux
-  cpu: ' AMD EPYC 7742 64-Core Processor'
-  cpu_count: 128
-  cpu_ram_mb: 540684
-  gpus:
-  - NVIDIA A100-SXM4-80GB
-  - NVIDIA A100-SXM4-80GB
-  - NVIDIA A100-SXM4-80GB
-  - NVIDIA A100-SXM4-80GB
diff --git a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(8)-sequence_length(256)-new_tokens(512)/3/.hydra/hydra.yaml b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(8)-sequence_length(256)-new_tokens(512)/3/.hydra/hydra.yaml
deleted file mode 100644
index 5f8fa193..00000000
--- a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(8)-sequence_length(256)-new_tokens(512)/3/.hydra/hydra.yaml
+++ /dev/null
@@ -1,174 +0,0 @@
-hydra:
-  run:
-    dir: experiments/${experiment_name}
-  sweep:
-    dir: experiments/${experiment_name}
-    subdir: ${hydra.job.num}
-  launcher:
-    _target_: hydra._internal.core_plugins.basic_launcher.BasicLauncher
-  sweeper:
-    _target_: hydra._internal.core_plugins.basic_sweeper.BasicSweeper
-    max_batch_size: null
-    params:
-      benchmark.input_shapes.batch_size: 1,2,4,8,16
-  help:
-    app_name: ${hydra.job.name}
-    header: '${hydra.help.app_name} is powered by Hydra.
-
-      '
-    footer: 'Powered by Hydra (https://hydra.cc)
-
-      Use --hydra-help to view Hydra specific help
-
-      '
-    template: '${hydra.help.header}
-
-      == Configuration groups ==
-
-      Compose your configuration from those groups (group=option)
-
-
-      $APP_CONFIG_GROUPS
-
-
-      == Config ==
-
-      Override anything in the config (foo.bar=value)
-
-
-      $CONFIG
-
-
-      ${hydra.help.footer}
-
-      '
-  hydra_help:
-    template: 'Hydra (${hydra.runtime.version})
-
-      See https://hydra.cc for more info.
-
-
-      == Flags ==
-
-      $FLAGS_HELP
-
-
-      == Configuration groups ==
-
-      Compose your configuration from those groups (For example, append hydra/job_logging=disabled
-      to command line)
-
-
-      $HYDRA_CONFIG_GROUPS
-
-
-      Use ''--cfg hydra'' to Show the Hydra config.
-
-      '
-    hydra_help: ???
-  hydra_logging:
-    version: 1
-    formatters:
-      colorlog:
-        (): colorlog.ColoredFormatter
-        format: '[%(cyan)s%(asctime)s%(reset)s][%(purple)sHYDRA%(reset)s] %(message)s'
-    handlers:
-      console:
-        class: logging.StreamHandler
-        formatter: colorlog
-        stream: ext://sys.stdout
-    root:
-      level: INFO
-      handlers:
-      - console
-    disable_existing_loggers: false
-  job_logging:
-    version: 1
-    formatters:
-      simple:
-        format: '[%(asctime)s][%(name)s][%(levelname)s] - %(message)s'
-      colorlog:
-        (): colorlog.ColoredFormatter
-        format: '[%(cyan)s%(asctime)s%(reset)s][%(blue)s%(name)s%(reset)s][%(log_color)s%(levelname)s%(reset)s]
-          - %(message)s'
-        log_colors:
-          DEBUG: purple
-          INFO: green
-          WARNING: yellow
-          ERROR: red
-          CRITICAL: red
-    handlers:
-      console:
-        class: logging.StreamHandler
-        formatter: colorlog
-        stream: ext://sys.stdout
-      file:
-        class: logging.FileHandler
-        formatter: simple
-        filename: ${hydra.job.name}.log
-    root:
-      level: INFO
-      handlers:
-      - console
-      - file
-    disable_existing_loggers: false
-  env: {}
-  mode: MULTIRUN
-  searchpath: []
-  callbacks: {}
-  output_subdir: .hydra
-  overrides:
-    hydra:
-    - hydra.mode=MULTIRUN
-    task:
-    - benchmark.input_shapes.batch_size=8
-  job:
-    name: experiment
-    chdir: true
-    override_dirname: benchmark.input_shapes.batch_size=8
-    id: '3'
-    num: 3
-    config_name: gptq
-    env_set:
-      CUDA_VISIBLE_DEVICES: '3'
-      CUDA_DEVICE_ORDER: PCI_BUS_ID
-    env_copy: []
-    config:
-      override_dirname:
-        kv_sep: '='
-        item_sep: ','
-        exclude_keys: []
-  runtime:
-    version: 1.3.2
-    version_base: '1.3'
-    cwd: /workspace/optimum-benchmark/examples/running-llamas
-    config_sources:
-    - path: hydra.conf
-      schema: pkg
-      provider: hydra
-    - path: optimum_benchmark
-      schema: pkg
-      provider: main
-    - path: hydra_plugins.hydra_colorlog.conf
-      schema: pkg
-      provider: hydra-colorlog
-    - path: /workspace/optimum-benchmark/examples/running-llamas/configs
-      schema: file
-      provider: command-line
-    - path: ''
-      schema: structured
-      provider: schema
-    output_dir: /workspace/optimum-benchmark/examples/running-llamas/experiments/gptq-batch_size(8)-sequence_length(256)-new_tokens(512)/3
-    choices:
-      benchmark: inference
-      backend: pytorch
-      hydra/env: default
-      hydra/callbacks: null
-      hydra/job_logging: colorlog
-      hydra/hydra_logging: colorlog
-      hydra/hydra_help: default
-      hydra/help: default
-      hydra/sweeper: basic
-      hydra/launcher: basic
-      hydra/output: default
-  verbose: false
diff --git a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(8)-sequence_length(256)-new_tokens(512)/3/.hydra/overrides.yaml b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(8)-sequence_length(256)-new_tokens(512)/3/.hydra/overrides.yaml
deleted file mode 100644
index 8cd14374..00000000
--- a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(8)-sequence_length(256)-new_tokens(512)/3/.hydra/overrides.yaml
+++ /dev/null
@@ -1 +0,0 @@
-- benchmark.input_shapes.batch_size=8
diff --git a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(8)-sequence_length(256)-new_tokens(512)/3/hydra_config.yaml b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(8)-sequence_length(256)-new_tokens(512)/3/hydra_config.yaml
deleted file mode 100644
index 06e20479..00000000
--- a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(8)-sequence_length(256)-new_tokens(512)/3/hydra_config.yaml
+++ /dev/null
@@ -1,79 +0,0 @@
-backend:
-  name: pytorch
-  version: 2.1.0+cu118
-  _target_: optimum_benchmark.backends.pytorch.backend.PyTorchBackend
-  seed: 42
-  inter_op_num_threads: null
-  intra_op_num_threads: null
-  initial_isolation_check: false
-  continous_isolation_check: false
-  delete_cache: false
-  no_weights: false
-  device_map: null
-  torch_dtype: float16
-  disable_grad: true
-  eval_mode: true
-  amp_autocast: false
-  amp_dtype: null
-  torch_compile: false
-  torch_compile_config: {}
-  bettertransformer: false
-  quantization_scheme: null
-  quantization_config: {}
-  use_ddp: false
-  ddp_config: {}
-  peft_strategy: null
-  peft_config: {}
-benchmark:
-  name: inference
-  _target_: optimum_benchmark.benchmarks.inference.benchmark.InferenceBenchmark
-  duration: 10
-  warmup_runs: 10
-  memory: true
-  energy: false
-  input_shapes:
-    batch_size: 8
-    sequence_length: 256
-    num_choices: 1
-    feature_size: 80
-    nb_max_frames: 3000
-    audio_sequence_length: 16000
-  new_tokens: 512
-  can_diffuse: false
-  can_generate: true
-  forward_kwargs: {}
-  generate_kwargs:
-    max_new_tokens: 512
-    min_new_tokens: 512
-    do_sample: false
-    use_cache: true
-    pad_token_id: 0
-    num_beams: 1
-experiment_name: gptq-batch_size(8)-sequence_length(256)-new_tokens(512)
-model: TheBloke/Llama-2-7B-GPTQ
-device: cuda
-task: text-generation
-hub_kwargs:
-  revision: main
-  cache_dir: null
-  force_download: false
-  local_files_only: false
-environment:
-  optimum_version: 1.13.2
-  optimum_commit: null
-  transformers_version: 4.34.1
-  transformers_commit: null
-  accelerate_version: 0.24.1
-  accelerate_commit: null
-  diffusers_version: null
-  diffusers_commit: null
-  python_version: 3.10.12
-  system: Linux
-  cpu: ' AMD EPYC 7742 64-Core Processor'
-  cpu_count: 128
-  cpu_ram_mb: 540684
-  gpus:
-  - NVIDIA A100-SXM4-80GB
-  - NVIDIA A100-SXM4-80GB
-  - NVIDIA A100-SXM4-80GB
-  - NVIDIA A100-SXM4-80GB
diff --git a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(8)-sequence_length(256)-new_tokens(512)/3/inference_results.csv b/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(8)-sequence_length(256)-new_tokens(512)/3/inference_results.csv
deleted file mode 100644
index 5c52e357..00000000
--- a/examples/running-llamas/experiments/A100-80GB/gptq-batch_size(8)-sequence_length(256)-new_tokens(512)/3/inference_results.csv
+++ /dev/null
@@ -1,2 +0,0 @@
-forward.latency(s),forward.throughput(samples/s),forward.max_memory_used(MB),forward.max_memory_allocated(MB),forward.max_memory_reserved(MB),generate.latency(s),generate.throughput(tokens/s),generate.max_memory_used(MB),generate.max_memory_allocated(MB),generate.max_memory_reserved(MB)
-0.223,35.9,8826,5597,6530,19.9,206.0,56629,10557,54333
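The files deleted above are the per-run artifacts Hydra writes for each point of the `benchmark.input_shapes.batch_size: 1,2,4,8,16` sweep: the composed but unresolved config (`.hydra/config.yaml`, where interpolations such as `${is_inference:${benchmark.name}}` are still symbolic), the Hydra runtime config (`.hydra/hydra.yaml`), the command-line overrides (`.hydra/overrides.yaml`), the fully resolved config (`hydra_config.yaml`, where the same fields appear as `disable_grad: true` and `eval_mode: true`), and a one-row `inference_results.csv`. A minimal sketch of how such runs can be aggregated into a single report, assuming pandas and PyYAML are installed and the `experiments/` layout shown above; this helper is illustrative and not part of the repository:

```python
# Hypothetical aggregation helper (not shipped with optimum-benchmark).
# Walks the experiments/ tree and joins each one-row inference_results.csv
# with its run's resolved hydra_config.yaml.
from pathlib import Path

import pandas as pd
import yaml

rows = []
for results_file in Path("experiments").glob("**/inference_results.csv"):
    run_dir = results_file.parent
    # hydra_config.yaml holds the fully resolved config (e.g. batch_size: 8).
    config = yaml.safe_load((run_dir / "hydra_config.yaml").read_text())
    row = pd.read_csv(results_file).iloc[0].to_dict()
    row["experiment_name"] = config["experiment_name"]
    row["batch_size"] = config["benchmark"]["input_shapes"]["batch_size"]
    rows.append(row)

report = pd.DataFrame(rows).sort_values("batch_size")
print(report[["experiment_name", "forward.latency(s)", "generate.throughput(tokens/s)"]])
```

For the two GPTQ runs above, such a report would show decode throughput rising from 134.0 tokens/s at batch size 4 to 206.0 tokens/s at batch size 8, while the memory reserved during generation grows more than 3x (15789 MB to 54333 MB).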