update reporting tool to target a memory type
actions-user committed Oct 31, 2023
1 parent f2b3f8a commit 96ae22a
Showing 6 changed files with 187 additions and 166 deletions.
7 changes: 6 additions & 1 deletion examples/running-llamas/README.md
@@ -26,11 +26,16 @@ This will create a folder called `experiments` with the results of the benchmark
To create a report, run:

```bash
python report.py -e experiments
python report.py -e experiments -m allocated
```

This will create some quick reporting artifacts: a `full_report.csv`, a `short_report.csv`, some plots, and a `rich_table.svg`.

`-e` is the experiments folder from which the results are read.
`-r` is the report folder to which the resulting artifacts are written.
`-m` is the memory type to use for the reporting; one of `used`, `allocated` or `reserved`.
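
For example, a hypothetical invocation that writes the artifacts to a folder named `my_report` and reports reserved memory instead:

```bash
python report.py -e experiments -r my_report -m reserved
```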


## Results

### On A100-80GB
(2 changed files could not be displayed.)
178 changes: 89 additions & 89 deletions examples/running-llamas/artifacts/A100-80GB/rich_table.svg
(File could not be displayed.)
22 changes: 11 additions & 11 deletions examples/running-llamas/artifacts/A100-80GB/short_report.csv
@@ -1,11 +1,11 @@
experiment_name,GPU,Batch Size,Forward Latency (s),Forward Throughput (samples/s),Forward Max Memory Used (MB),Forward Max Memory Allocated (MB),Forward Max Memory Reserved (MB),Generate Throughput (tokens/s),Generate Max Memory Used (MB),Generate Max Memory Allocated (MB),Generate Max Memory Reserved (MB),Quantization Scheme
fp16-batch_size(16)-sequence_length(256)-new_tokens(512),A100,16,0.402,39.8,19165,16520,17779,471.0,27988,26442,84511,fp16
fp16-batch_size(8)-sequence_length(256)-new_tokens(512),A100,8,0.204,39.2,17087,15037,15701,290.0,64889,19997,63503,fp16
gptq-batch_size(16)-sequence_length(256)-new_tokens(512),A100,16,0.415,38.6,10900,7080,8604,333.0,65676,17002,83596,GPTQ
fp16-batch_size(4)-sequence_length(256)-new_tokens(512),A100,4,0.107,37.4,16022,14295,14636,147.0,26346,16774,24960,fp16
gptq-batch_size(8)-sequence_length(256)-new_tokens(512),A100,8,0.223,35.9,8826,5597,6530,206.0,56629,10557,54333,GPTQ
fp16-batch_size(2)-sequence_length(256)-new_tokens(512),A100,2,0.0579,34.5,15392,13924,14006,75.3,17003,15162,15617,fp16
gptq-batch_size(4)-sequence_length(256)-new_tokens(512),A100,4,0.122,32.8,7761,4855,5465,134.0,18085,7335,15789,GPTQ
fp16-batch_size(1)-sequence_length(256)-new_tokens(512),A100,1,0.0328,30.5,15153,13738,13767,37.9,15866,14356,14480,fp16
gptq-batch_size(2)-sequence_length(256)-new_tokens(512),A100,2,0.0706,28.3,6872,4484,4575,66.5,8822,5722,6526,GPTQ
gptq-batch_size(1)-sequence_length(256)-new_tokens(512),A100,1,0.0458,21.8,6746,4298,4450,34.6,7606,4916,5309,GPTQ
experiment_name,GPU,Batch Size,Forward Latency (s),Forward Throughput (samples/s),Forward Max Memory Used (MB),Forward Max Memory Allocated (MB),Forward Max Memory Reserved (MB),Generate Throughput (tokens/s),Generate Max Memory Used (MB),Generate Max Memory Allocated (MB),Generate Max Memory Reserved (MB),Quantization Scheme,Group
fp16-batch_size(16)-sequence_length(256)-new_tokens(512),A100,16,0.402,39.8,19165,16520,17779,471.0,27988,26442,84511,fp16,A100-fp16
fp16-batch_size(8)-sequence_length(256)-new_tokens(512),A100,8,0.204,39.2,17087,15037,15701,290.0,64889,19997,63503,fp16,A100-fp16
gptq-batch_size(16)-sequence_length(256)-new_tokens(512),A100,16,0.415,38.6,10900,7080,8604,333.0,65676,17002,83596,GPTQ,A100-GPTQ
fp16-batch_size(4)-sequence_length(256)-new_tokens(512),A100,4,0.107,37.4,16022,14295,14636,147.0,26346,16774,24960,fp16,A100-fp16
gptq-batch_size(8)-sequence_length(256)-new_tokens(512),A100,8,0.223,35.9,8826,5597,6530,206.0,56629,10557,54333,GPTQ,A100-GPTQ
fp16-batch_size(2)-sequence_length(256)-new_tokens(512),A100,2,0.0579,34.5,15392,13924,14006,75.3,17003,15162,15617,fp16,A100-fp16
gptq-batch_size(4)-sequence_length(256)-new_tokens(512),A100,4,0.122,32.8,7761,4855,5465,134.0,18085,7335,15789,GPTQ,A100-GPTQ
fp16-batch_size(1)-sequence_length(256)-new_tokens(512),A100,1,0.0328,30.5,15153,13738,13767,37.9,15866,14356,14480,fp16,A100-fp16
gptq-batch_size(2)-sequence_length(256)-new_tokens(512),A100,2,0.0706,28.3,6872,4484,4575,66.5,8822,5722,6526,GPTQ,A100-GPTQ
gptq-batch_size(1)-sequence_length(256)-new_tokens(512),A100,1,0.0458,21.8,6746,4298,4450,34.6,7606,4916,5309,GPTQ,A100-GPTQ
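
The only substantive change to this file is the new trailing `Group` column. A minimal sketch of how it is derived, mirroring `get_short_report` in the `report.py` diff below:

```python
import pandas as pd

# sketch: the Group column concatenates the GPU name and the quantization scheme
short_report = pd.DataFrame({"GPU": ["A100"], "Quantization Scheme": ["fp16"]})
short_report["Group"] = short_report["GPU"] + "-" + short_report["Quantization Scheme"]
print(short_report["Group"].iloc[0])  # A100-fp16
```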
146 changes: 81 additions & 65 deletions examples/running-llamas/report.py
@@ -11,7 +11,7 @@
from rich.terminal_theme import MONOKAI


def gather_inference_report(root_folder: Path) -> DataFrame:
def gather_full_report(root_folder: Path, report_folder: str = "artifacts") -> DataFrame:
# key is path to inference file as string, value is dataframe
inference_dfs = {
f.parent.absolute().as_posix(): pd.read_csv(f) for f in root_folder.glob("**/inference_results.csv")
@@ -37,6 +37,10 @@ def gather_inference_report(root_folder: Path) -> DataFrame:
# Concatenate all reports
inference_report = pd.concat(inference_reports, axis=0, ignore_index=True)
inference_report.set_index("experiment_name", inplace=True)

inference_report.sort_values(by="forward.throughput(samples/s)", ascending=False, inplace=True)
inference_report.to_csv(f"{report_folder}/full_report.csv")

return inference_report


@@ -77,7 +81,7 @@ def format_row(row, style=""):
return formated_row


def get_short_report(inference_report):
def get_short_report(full_report, report_folder: str = "artifacts"):
short_columns = {
"environment.gpus": "GPU",
"benchmark.input_shapes.batch_size": "Batch Size",
@@ -91,8 +95,8 @@ def get_short_report(inference_report):
"generate.max_memory_allocated(MB)": "Generate Max Memory Allocated (MB)",
"generate.max_memory_reserved(MB)": "Generate Max Memory Reserved (MB)",
}
short_report = inference_report[list(short_columns.keys())].rename(columns=short_columns)
short_report["Quantization Scheme"] = inference_report.index.str.split("-").str[0]
short_report = full_report[list(short_columns.keys())].rename(columns=short_columns)
short_report["Quantization Scheme"] = full_report.index.str.split("-").str[0]
short_report["Quantization Scheme"].fillna("unquantized", inplace=True)
short_report["Quantization Scheme"].replace("bnb", "BnB", inplace=True)
short_report["Quantization Scheme"].replace("gptq", "GPTQ", inplace=True)
@@ -103,10 +107,12 @@ def get_short_report(inference_report):

short_report["Group"] = short_report["GPU"] + "-" + short_report["Quantization Scheme"]

short_report.to_csv(f"{report_folder}/short_report.csv")

return short_report


def get_rich_table(short_report):
def get_rich_table(short_report, report_folder: str = "artifacts"):
# create rich table
rich_table = Table(show_header=True, show_lines=True)
# we add a column for the index
@@ -118,10 +124,14 @@ def get_rich_table(short_report):
for index, row in short_report.iterrows():
rich_table.add_row(index, *format_row(row.values, style=""))

console = Console(record=True)
console.print(rich_table, justify="center")
console.save_svg(f"{report_folder}/rich_table.svg", theme=MONOKAI, title="Inference Report")

return rich_table


def get_throughput_plot(short_report):
def get_plots(short_report, memory: str = "allocated", report_folder: str = "artifacts"):
# for each quantization scheme we plot the throughput vs batch size
fig1, ax1 = plt.subplots()
fig2, ax2 = plt.subplots()
@@ -166,42 +176,45 @@
label=group,
marker="o",
)
ax3.plot(
forward_memory["Batch Size"],
forward_memory["Forward Max Memory Used (MB)"],
label=group + "-used",
marker="^",
)
ax3.plot(
forward_pytorch_max_memory_reserved["Batch Size"],
forward_pytorch_max_memory_reserved["Forward Max Memory Reserved (MB)"],
label=group + "-reserved",
marker=".",
)
ax3.plot(
forward_pytorch_max_memory_allocated["Batch Size"],
forward_pytorch_max_memory_allocated["Forward Max Memory Allocated (MB)"],
label=group + "-allocated",
marker="v",
)
ax4.plot(
generate_memory["Batch Size"],
generate_memory["Generate Max Memory Used (MB)"],
label=group + "-used",
marker="^",
)
ax4.plot(
generate_pytorch_max_memory_reserved["Batch Size"],
generate_pytorch_max_memory_reserved["Generate Max Memory Reserved (MB)"],
label=group + "-reserved",
marker=".",
)
ax4.plot(
generate_pytorch_max_memory_allocated["Batch Size"],
generate_pytorch_max_memory_allocated["Generate Max Memory Allocated (MB)"],
label=group + "-allocated",
marker="v",
)
if "used" in memory:
ax3.plot(
forward_memory["Batch Size"],
forward_memory["Forward Max Memory Used (MB)"],
label=group + "-used",
marker="^",
)
ax4.plot(
generate_memory["Batch Size"],
generate_memory["Generate Max Memory Used (MB)"],
label=group + "-used",
marker="^",
)
elif "reserved" in memory:
ax3.plot(
forward_pytorch_max_memory_reserved["Batch Size"],
forward_pytorch_max_memory_reserved["Forward Max Memory Reserved (MB)"],
label=group + "-reserved",
marker=".",
)
ax4.plot(
generate_pytorch_max_memory_reserved["Batch Size"],
generate_pytorch_max_memory_reserved["Generate Max Memory Reserved (MB)"],
label=group + "-reserved",
marker=".",
)
elif "allocated" in memory:
ax3.plot(
forward_pytorch_max_memory_allocated["Batch Size"],
forward_pytorch_max_memory_allocated["Forward Max Memory Allocated (MB)"],
label=group + "-allocated",
marker="*",
)
ax4.plot(
generate_pytorch_max_memory_allocated["Batch Size"],
generate_pytorch_max_memory_allocated["Generate Max Memory Allocated (MB)"],
label=group + "-allocated",
marker="*",
)

ax1.set_xlabel("Batch Size")
ax1.set_ylabel("Forward Latency (s)")
@@ -224,6 +237,11 @@
ax3.legend(fancybox=True, shadow=True)
ax4.legend(fancybox=True, shadow=True)

fig1.savefig(f"{report_folder}/forward_latency_plot.png")
fig2.savefig(f"{report_folder}/generate_throughput_plot.png")
fig3.savefig(f"{report_folder}/forward_memory_plot.png")
fig4.savefig(f"{report_folder}/generate_memory_plot.png")

return fig1, fig2, fig3, fig4


@@ -236,43 +254,41 @@ def generate_report():
required=True,
help="The folder containing the results of experiments.",
)
parser.add_argument(
"--memory",
"-m",
nargs="*",
type=str,
required=True,
help="choose memory metric",
choices=["used", "reserved", "allocated"],
default="allocated",
)
parser.add_argument(
"--report-name",
"-r",
type=str,
required=False,
default="artifacts",
help="The name of the report.",
)

args = parser.parse_args()
report_folder = args.report_name
experiments_folders = args.experiments

if args.report_name:
report_folder = f"artifacts/{args.report_name}"
else:
report_folder = "artifacts"
Path(report_folder).mkdir(parents=True, exist_ok=True)

# gather experiments results
inference_report = gather_inference_report(experiments_folders)
inference_report.sort_values(by="forward.throughput(samples/s)", ascending=False, inplace=True)
inference_report.to_csv(f"{report_folder}/full_report.csv")

short_report = get_short_report(inference_report)
short_report.to_csv(f"{report_folder}/short_report.csv")

forward_throughput_plot, generate_throughput_plot, forward_memory_plot, generate_memory_plot = get_throughput_plot(
short_report
)
forward_throughput_plot.savefig(f"{report_folder}/forward_latency_plot.png")
generate_throughput_plot.savefig(f"{report_folder}/generate_throughput_plot.png")
forward_memory_plot.savefig(f"{report_folder}/forward_memory_plot.png")
generate_memory_plot.savefig(f"{report_folder}/generate_memory_plot.png")
if len(args.memory) == 0:
memory = ["used", "reserved", "allocated"]
else:
memory = args.memory

rich_table = get_rich_table(short_report)
console = Console(record=True)
console.print(rich_table, justify="center")
console.save_svg(f"{report_folder}/rich_table.svg", theme=MONOKAI, title="Inference Report")
# gather experiments results
full_report = gather_full_report(experiments_folders, report_folder=report_folder)
short_report = get_short_report(full_report, report_folder=report_folder)
figs = get_plots(short_report, memory=memory, report_folder=report_folder)
rich_table = get_rich_table(short_report, report_folder=report_folder)


if __name__ == "__main__":
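A note on the new `--memory` flag: it is declared with `nargs="*"`, so it must be present but may be given zero or more values, and an empty value list falls back to all three metrics. Hypothetical invocations, assuming the argparse setup above:

```bash
# plot only PyTorch-allocated memory
python report.py -e experiments -m allocated

# pass the flag with no values to fall back to all three memory metrics
python report.py -e experiments -m
```

Note that `get_plots` selects the metric with an `if`/`elif` chain, so when several types are passed, only the first match (`used`, then `reserved`, then `allocated`) is actually plotted.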
