From cc5ee388bea7af255a746e8361f6a06992958cc1 Mon Sep 17 00:00:00 2001 From: Johnu George Date: Wed, 26 Apr 2023 20:10:17 +0000 Subject: [PATCH 01/10] Changes for allowing runs with variable dataset sizes --- dlio_benchmark | 2 +- storage-conf/workload/bert.yaml | 1 + storage-conf/workload/unet3d.yaml | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/dlio_benchmark b/dlio_benchmark index b97f2a8..58db52b 160000 --- a/dlio_benchmark +++ b/dlio_benchmark @@ -1 +1 @@ -Subproject commit b97f2a8b34f4c7d21daf4eb3987df1b0aa1c4650 +Subproject commit 58db52b9e832361c637d9e454a81c0e63ba8aab8 diff --git a/storage-conf/workload/bert.yaml b/storage-conf/workload/bert.yaml index 07cb044..d730132 100644 --- a/storage-conf/workload/bert.yaml +++ b/storage-conf/workload/bert.yaml @@ -17,6 +17,7 @@ dataset: file_prefix: part train: + seed_change_epoch: False computation_time: 0.968 total_training_steps: 1000 diff --git a/storage-conf/workload/unet3d.yaml b/storage-conf/workload/unet3d.yaml index 4e486f8..6f9b37c 100644 --- a/storage-conf/workload/unet3d.yaml +++ b/storage-conf/workload/unet3d.yaml @@ -23,6 +23,7 @@ reader: sample_shuffle: seed train: + seed_change_epoch: False epochs: 5 computation_time: 1.3604 From 4f4d56bcb55e34f517d73a8cdf26b72748cd7e57 Mon Sep 17 00:00:00 2001 From: Nathan Wasson Date: Tue, 16 May 2023 17:34:28 -0500 Subject: [PATCH 02/10] Update license header --- LICENSE.md | 305 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 177 insertions(+), 128 deletions(-) diff --git a/LICENSE.md b/LICENSE.md index 96fe062..f433b1a 100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -1,128 +1,177 @@ -MLCOMMONS ASSOCIATION LICENSE - -TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - -This license reproduces without alteration the terms of the Apache License - -Version 2.0, January 2004 - -http://www.apache.org/licenses/ - -1. Definitions. 
-"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by -Sections 1 through 9 of this document. -"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is -granting the License. -"Legal Entity" shall mean the union of the acting entity and all other entities that control, are -controlled by, or are under common control with that entity. For the purposes of this definition, -"control" means (i) the power, direct or indirect, to cause the direction or management of such -entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the -outstanding shares, or (iii) beneficial ownership of such entity. -"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this -License. -"Source" form shall mean the preferred form for making modifications, including but not limited to -software source code, documentation source, and configuration files. -"Object" form shall mean any form resulting from mechanical transformation or translation of a -Source form, including but not limited to compiled object code, generated documentation, and -conversions to other media types. -"Work" shall mean the work of authorship, whether in Source or Object form, made available under -the License, as indicated by a copyright notice that is included in or attached to the work (an example -is provided in the Appendix below). -"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or -derived from) the Work and for which the editorial revisions, annotations, elaborations, or other -modifications represent, as a whole, an original work of authorship. For the purposes of this License, -Derivative Works shall not include works that remain separable from, or merely link (or bind by -name) to the interfaces of, the Work and Derivative Works thereof. 
-"Contribution" shall mean any work of authorship, including the original version of the Work and -any modifications or additions to that Work or Derivative Works thereof, that is intentionally -submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal -Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, -"submitted" means any form of electronic, verbal, or written communication sent to the Licensor or -its representatives, including but not limited to communication on electronic mailing lists, source -code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor -for the purpose of discussing and improving the Work, but excluding communication that is -conspicuously marked or otherwise designated in writing by the copyright owner as "Not a -Contribution." -"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a -Contribution has been received by Licensor and subsequently incorporated within the Work. - -2. Grant of Copyright License. Subject to the terms and conditions of this License, each -Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, -1106217.1 -irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly -perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. - -3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor -hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable -(except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, -and otherwise transfer the Work, where such license applies only to those patent claims licensable by -such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of -their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute -patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging -that the Work or a Contribution incorporated within the Work constitutes direct or contributory -patent infringement, then any patent licenses granted to You under this License for that Work shall -terminate as of the date such litigation is filed. - -4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works -thereof in any medium, with or without modifications, and in Source or Object form, provided that -You meet the following conditions: - - 1. You must give any other recipients of the Work or Derivative Works a copy of this License; -and - - 2. You must cause any modified files to carry prominent notices stating that You changed the -files; and - - 3. You must retain, in the Source form of any Derivative Works that You distribute, all -copyright, patent, trademark, and attribution notices from the Source form of the Work, -excluding those notices that do not pertain to any part of the Derivative Works; and - - 4. 
If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative -Works that You distribute must include a readable copy of the attribution notices contained -within such NOTICE file, excluding those notices that do not pertain to any part of the -Derivative Works, in at least one of the following places: within a NOTICE text file -distributed as part of the Derivative Works; within the Source form or documentation, if -provided along with the Derivative Works; or, within a display generated by the Derivative -Works, if and wherever such third-party notices normally appear. The contents of the -NOTICE file are for informational purposes only and do not modify the License. You may -add Your own attribution notices within Derivative Works that You distribute, alongside or -as an addendum to the NOTICE text from the Work, provided that such additional -attribution notices cannot be construed as modifying the License. -You may add Your own copyright statement to Your modifications and may provide -additional or different license terms and conditions for use, reproduction, or distribution of -Your modifications, or for any such Derivative Works as a whole, provided Your use, -reproduction, and distribution of the Work otherwise complies with the conditions stated in -this License. - -5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution -intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms -and conditions of this License, without any additional terms or conditions. Notwithstanding the -above, nothing herein shall supersede or modify the terms of any separate license agreement you -may have executed with Licensor regarding such Contributions. -6. Trademarks. 
This License does not grant permission to use the trade names, trademarks, -service marks, or product names of the Licensor, except as required for reasonable and customary -use in describing the origin of the Work and reproducing the content of the NOTICE file. - -7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor -provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, -without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, -MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for -determining the appropriateness of using or redistributing the Work and assume any risks associated -with Your exercise of permissions under this License. - -8. Limitation of Liability. In no event and under no legal theory, whether in tort (including -negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly -negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including -any direct, indirect, special, incidental, or consequential damages of any character arising as a result -of this License or out of the use or inability to use the Work (including but not limited to damages for -loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial -damages or losses), even if such Contributor has been advised of the possibility of such damages. - -9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative -Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, -indemnity, or other liability obligations and/or rights consistent with this License. 
However, in -accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not -on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each -Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by -reason of your accepting any such warranty or additional liability. + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS From 8eb04c5975c176ba9520a3a560ccd627417c4004 Mon Sep 17 00:00:00 2001 From: Nathan Wasson Date: Tue, 16 May 2023 17:35:31 -0500 Subject: [PATCH 03/10] Create CODEOWNERS --- .github/CODEOWNERS | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .github/CODEOWNERS diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 0000000..2d2a1b2 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,3 @@ +# These owners will be the default owners for everything in the repo. 
+# Unless a later match takes precedence,they will be requested for review when someone opens a pull request. +* @mlcommons/wg-storage From 5266a2b154b676cd3360e4fa675160db60bdcb20 Mon Sep 17 00:00:00 2001 From: Johnu George Date: Thu, 18 May 2023 19:29:03 +0000 Subject: [PATCH 04/10] Update DLIO to latest --- dlio_benchmark | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlio_benchmark b/dlio_benchmark index 58db52b..130b18c 160000 --- a/dlio_benchmark +++ b/dlio_benchmark @@ -1 +1 @@ -Subproject commit 58db52b9e832361c637d9e454a81c0e63ba8aab8 +Subproject commit 130b18cb4c53dd5d603f9c72cbaff843ad34c5f0 From 3b1f07a816ac5c8caa8e38407de35e361591cc9f Mon Sep 17 00:00:00 2001 From: Johnu George Date: Sat, 20 May 2023 20:17:37 +0000 Subject: [PATCH 05/10] Script changes for dataset subset selection with validation --- benchmark.sh | 10 +- report.py | 254 ++++++++++++++++-------------- storage-conf/workload/bert.yaml | 1 + storage-conf/workload/unet3d.yaml | 1 + 4 files changed, 146 insertions(+), 120 deletions(-) diff --git a/benchmark.sh b/benchmark.sh index 8364917..efa6344 100755 --- a/benchmark.sh +++ b/benchmark.sh @@ -10,7 +10,7 @@ WORKLOADS=("unet3d" "bert") UNET3D_CONFIG_FILE=${CONFIG_PATH}/workload/unet3d.yaml BERT_CONFIG_FILE=${CONFIG_PATH}/workload/bert.yaml # Currently only "closed" category is supported -CATEGORIES=("closed") +CATEGORIES=("closed" "open") DEFAULT_CATEGORY="closed" CLOSED_CATEGORY_PARAMS=( # dataset params @@ -25,6 +25,12 @@ CLOSED_CATEGORY_PARAMS=( OPEN_CATEGORY_PARAMS=( # all closed params "${CLOSED_CATEGORY_PARAMS[@]}" + # framework params + "framework" + # dataset params + "dataset.format" "dataset.num_samples_per_file" + # reader params + "reader.data_loader" "reader.transfer_size" ) HYDRA_OUTPUT_CONFIG_DIR="configs" EXTRA_PARAMS=( @@ -272,7 +278,7 @@ configview() { postprocess() { local results_dir=$1 - python3 report.py --result-dir $results_dir --multi-host --create-report + python3 report.py --result-dir 
$results_dir } main() { diff --git a/report.py b/report.py index 9e7c106..d200642 100644 --- a/report.py +++ b/report.py @@ -4,6 +4,7 @@ import logging import argparse import numpy as np +from dateutil import parser # final report created by Storage benchmark run REPORT_FILE = "mlperf_storage_report.json" @@ -14,139 +15,161 @@ # summary file created by DLIO in the results folder after every run SUMMARY_FILE = "summary.json" -class StorageReport(object): +# minimum runs required for the submission +REQUIRED_BENCHMARK_RUNS = 5 - def __init__(self, args): - # summary file create - self.result_dir = args.result_dir - self.multi_host = args.multi_host +# Maximum start time gap between host runs in seconds +MAX_START_TIMESTAMP_GAP = 10 - def find_file_path(self, directory): +def find_file_path(directory): found_files = [] for root, dirs, files in os.walk(directory): if SUMMARY_FILE in files: found_files.append(os.path.join(root, SUMMARY_FILE)) return found_files - def save_data(self, results): - # Dump statistic counters to files - # Overall stats - with open(REPORT_FILE, 'w') as outfile: - json.dump(results, outfile, indent=4, default=str) - logging.info(f"Final report generated: {REPORT_FILE}") - - - # read summary for DLIO summary file - def get_summary(self, summary_file): - f = open(summary_file) - summary = json.load(f) - num_acclerators = summary['num_accelerators'] - host_names = summary['hostname'] - au = summary['metric']['train_au_mean_percentage'] - throughput_sps = summary['metric']['train_throughput_mean_samples_per_second'] - throughput_mps = summary['metric']['train_io_mean_MB_per_second'] - return (num_acclerators, au, throughput_sps, throughput_mps, host_names) +def save_data(results): + # Dump statistic counters to files + # Overall stats + with open(REPORT_FILE, 'w') as outfile: + json.dump(results, outfile, indent=4, default=str) + logging.info(f"Final report generated: {REPORT_FILE}") + +def check_unique(list_arg): + if len(set(list_arg)) == 1: + 
return True + else: + return False + +def check_timestamps(start_timestamps): + ts = list(map(lambda x: parser.parse(x),start_timestamps)) + max_ts = max(ts) + min_ts = min(ts) + if (max_ts-min_ts).total_seconds() > MAX_START_TIMESTAMP_GAP: + return False + return True + +# read summary for DLIO summary file +def get_summary(summary_file): + f = open(summary_file) + summary = json.load(f) + return summary + +class StorageReport(object): + + def __init__(self, args): + # summary file create + self.result_dir = args.result_dir # accumulate results from multiple directories in case of multi hosts # report benchmark success or failure in case of a single host def generate_report(self): runs = {} - summary_files = self.find_file_path(self.result_dir) + summary_files = find_file_path(self.result_dir) if len(summary_files) == 0: logging.error(f"Error: {SUMMARY_FILE} file not found in {self.result_dir}") sys.exit(1) - # report benchmark success or failure in case of a single host - if not self.multi_host: - result = {} - if len(summary_files) > 1: - logging.error(f"Error: Multiple files found with the same file name {SUMMARY_FILE}") + + # accumulate results from multiple directories in case of multi hosts + results={} + results["overall"] = {} + results["runs"] = {} + train_throughput = [] + train_au = [] + for summary_file in summary_files: + path = summary_file.split("/") + if len(path) != 4: + logging.error(f"Error: Directory structure {summary_file} is not correct. 
It has be in format result_dir/run(1..n)/host(1..n)/summary.json") sys.exit(1) + run_name = path[1] + if run_name not in runs: + runs[run_name] = [summary_file] else: - #report success/failure and return - metrics = self.get_summary(summary_files[0]) - num_acclerators = metrics[0] - au = metrics[1] - throughput_sps = metrics[2] - throughput_mps = metrics[3] - - status = "succeeded" if float(au) >= AU_THRESHOLD else "failed" - logging.info("------------------------------") - logging.info(f"Benchmark {status}") - logging.info(f"Number of accelerators: {num_acclerators}") - logging.info(f"Average training throughput: {throughput_sps:.2f} samples/sec({throughput_mps:.2f} MB/sec)") - logging.info("------------------------------") - result = {"status": status} - return result - # accumulate results from multiple directories in case of multi hosts - else: - results={} - results["overall"] = {} - results["runs"] = {} - train_throughput = [] - train_au = [] - for summary_file in summary_files: - path = summary_file.split("/") - if len(path) != 4: - logging.error(f"Error: Directory structure {summary_file} is not correct. It has be in format result_dir/run(1..n)/host(1..n)/summary.json") + runs[run_name].append(summary_file) + if len(runs) != REQUIRED_BENCHMARK_RUNS: + logging.error(f"Error: Results are reported only for {len(runs)} runs. 
{REQUIRED_BENCHMARK_RUNS} runs are required.") + sys.exit(1) + host_arr = [len(runs[run_name]) for run_name in runs] + if len(set(host_arr)) != 1: + logging.error("Error: Number of participating hosts must be same across all runs") + sys.exit(1) + num_hosts = host_arr[0] + for run_name in runs: + num_acclerators = [] + train_throughput_sps = [] + train_throughput_mps = [] + host_names = [] + num_files_train = [] + num_samples_per_file = [] + start_host_timestamp = [] + results["runs"][run_name] ={} + for summary_file in runs[run_name]: + summary = get_summary(summary_file) + au = summary['metric']['train_au_mean_percentage'] + if float(au) < AU_THRESHOLD: + logging.error(f"Error: AU value didn't pass the threshold in the run reported by {summary_file}") sys.exit(1) - run_name = path[1] - if run_name not in runs: - runs[run_name] = [summary_file] - else: - runs[run_name].append(summary_file) - host_arr = [len(runs[run_name]) for run_name in runs] - if len(set(host_arr)) != 1: - logging.error("Error: Number of participating hosts must be same across all runs") + num_acclerators.append(summary['num_accelerators']) + train_throughput_sps.append(summary['metric']['train_throughput_mean_samples_per_second']) + train_throughput_mps.append(summary['metric']['train_io_mean_MB_per_second']) + host_names.append(summary['hostname']) + num_files_train.append(summary['num_files_train']) + num_samples_per_file.append(summary['num_samples_per_file']) + start_host_timestamp.append(summary['start']) + if len(set(host_names)) != len(host_names): + logging.warning(f"Warning: Hostnames in results of run {run_name} are not unique.") + + if not check_unique(num_acclerators): + logging.error(f"Error: Number of accelerators are different across hosts") sys.exit(1) - num_hosts = host_arr[0] - for run_name in runs: - num_acclerators = [] - train_throughput_sps = [] - train_throughput_mps = [] - host_names = [] - results["runs"][run_name] ={} - for summary_file in runs[run_name]: - summary = 
self.get_summary(summary_file) - au = summary[1] - if float(au) < AU_THRESHOLD: - logging.error(f"Error: AU value didn't pass the threshold in the run reported by {summary_file}") - sys.exit(1) - num_acclerators.append(summary[0]) - train_throughput_sps.append(summary[1]) - train_throughput_mps.append(summary[3]) - host_names.append(summary[4]) - if len(set(host_names)) != len(host_names): - logging.warning(f"Warning: Hostnames in results of run {run_name} are not unique") - - #results["runs"][run_name]["num_acclerators"] = num_acclerators - #results["runs"][run_name]["train_throughput_samples_per_second"] = train_throughput_sps - #results["runs"][run_name]["train_throughput_MB_per_second"] = train_throughput_mps - results["runs"][run_name]["train_throughput_samples_per_second"] = np.sum(np.array(train_throughput_sps)) - results["runs"][run_name]["train_throughput_MB_per_second"] = np.sum(np.array(train_throughput_mps)) - results["runs"][run_name]["train_num_accelerators"] = np.sum(np.array(num_acclerators)) - - overall_train_throughput_sps = [results["runs"][run_name]["train_throughput_samples_per_second"] for run_name in results["runs"]] - overall_train_throughput_mps = [results["runs"][run_name]["train_throughput_MB_per_second"] for run_name in results["runs"]] - overall_train_num_accelerators = [results["runs"][run_name]["train_num_accelerators"] for run_name in results["runs"]] - - if len(set(overall_train_num_accelerators)) != 1: - logging.error(f"Error: Number of accelerators are different across runs") + if not check_unique(num_files_train): + logging.error(f"Error: Number of training files are different across hosts") sys.exit(1) - results["overall"]["num_client_hosts"] = num_hosts - results["overall"]["num_benchmark_runs"] = len(results["runs"]) - results["overall"]["train_num_accelerators"] = overall_train_num_accelerators[0] - results["overall"]["train_throughput_mean_samples_per_second"] = np.mean(overall_train_throughput_sps) - 
results["overall"]["train_throughput_stdev_samples_per_second"] = np.std(overall_train_throughput_sps) - results["overall"]["train_throughput_mean_MB_per_second"] = np.mean(overall_train_throughput_mps) - results["overall"]["train_throughput_stdev_MB_per_second"] = np.std(overall_train_throughput_mps) - results["overall"]["train_num_accelerators"] = overall_train_num_accelerators[0] - logging.info("------------------------------") - logging.info(f'Number of client hosts: {results["overall"]["num_client_hosts"]}') - logging.info(f'Number of benchmark runs: {results["overall"]["num_benchmark_runs"]}') - logging.info(f'Overall number of accelerators: {results["overall"]["train_num_accelerators"]}') - logging.info(f'Overall Training Throughput (samples/second): {results["overall"]["train_throughput_mean_samples_per_second"]:.2f} ({results["overall"]["train_throughput_stdev_samples_per_second"]})') - logging.info(f'Overall Training Throughput (MB/second): {results["overall"]["train_throughput_mean_MB_per_second"]:.2f} ({results["overall"]["train_throughput_stdev_MB_per_second"]})') - logging.info("------------------------------") + if not check_unique(num_samples_per_file): + logging.error(f"Error: Number of samples per file are different across hosts") + sys.exit(1) + if not check_timestamps(start_host_timestamp): + logging.error(f"Error: Start timestamps of all hosts in each run must be within {MAX_START_TIMESTAMP_GAP} sec") + sys.exit(1) + + results["runs"][run_name]["train_throughput_samples_per_second"] = np.sum(np.array(train_throughput_sps)) + results["runs"][run_name]["train_throughput_MB_per_second"] = np.sum(np.array(train_throughput_mps)) + results["runs"][run_name]["train_num_accelerators"] = np.sum(np.array(num_acclerators)) + results["runs"][run_name]["num_files_train"] = num_files_train[0] + results["runs"][run_name]["num_samples_per_file"] = num_samples_per_file[0] + + overall_train_throughput_sps = 
[results["runs"][run_name]["train_throughput_samples_per_second"] for run_name in results["runs"]] + overall_train_throughput_mps = [results["runs"][run_name]["train_throughput_MB_per_second"] for run_name in results["runs"]] + overall_train_num_accelerators = [results["runs"][run_name]["train_num_accelerators"] for run_name in results["runs"]] + overall_num_files_train = [results["runs"][run_name]["num_files_train"] for run_name in results["runs"]] + overall_num_samples_per_file = [results["runs"][run_name]["num_samples_per_file"] for run_name in results["runs"]] + + if not check_unique(overall_train_num_accelerators): + logging.error(f"Error: Number of accelerators are different across runs") + sys.exit(1) + if not check_unique(overall_num_files_train): + logging.error(f"Error: Number of training files are different across runs") + sys.exit(1) + if not check_unique(overall_num_samples_per_file): + logging.error(f"Error: Number of samples per file are different across runs") + sys.exit(1) + + results["overall"]["num_client_hosts"] = num_hosts + results["overall"]["num_benchmark_runs"] = len(results["runs"]) + results["overall"]["train_num_accelerators"] = overall_train_num_accelerators[0] + results["overall"]["num_files_train"] = overall_num_files_train[0] + results["overall"]["num_samples_per_file"] = overall_num_samples_per_file[0] + results["overall"]["train_throughput_mean_samples_per_second"] = np.mean(overall_train_throughput_sps) + results["overall"]["train_throughput_stdev_samples_per_second"] = np.std(overall_train_throughput_sps) + results["overall"]["train_throughput_mean_MB_per_second"] = np.mean(overall_train_throughput_mps) + results["overall"]["train_throughput_stdev_MB_per_second"] = np.std(overall_train_throughput_mps) + logging.info("------------------------------") + logging.info(f'Number of client hosts: {results["overall"]["num_client_hosts"]}') + logging.info(f'Number of benchmark runs: {results["overall"]["num_benchmark_runs"]}') + 
logging.info(f'Overall number of accelerators: {results["overall"]["train_num_accelerators"]}') + logging.info(f'Overall Training Throughput (samples/second): {results["overall"]["train_throughput_mean_samples_per_second"]:.2f} ({results["overall"]["train_throughput_stdev_samples_per_second"]})') + logging.info(f'Overall Training Throughput (MB/second): {results["overall"]["train_throughput_mean_MB_per_second"]:.2f} ({results["overall"]["train_throughput_stdev_MB_per_second"]})') + logging.info("------------------------------") return results def main(): @@ -156,10 +179,6 @@ def main(): parser = argparse.ArgumentParser(description='Storage report generator') parser.add_argument("-rd", "--result-dir", type=str, default="", help="Location to the results directory of a benchmark run which contains summary.json") - parser.add_argument("-sh", "--multi-host", action="store_true", - help="If set, multi host results are considered else single host results are considered") - parser.add_argument("-rg", "--create-report", action="store_true", - help="If set, result report file generation will be created ") logging.basicConfig( format='%(asctime)s %(message)s', level=logging.DEBUG, @@ -167,8 +186,7 @@ def main(): args = parser.parse_args() postproc = StorageReport(args) results = postproc.generate_report() - if args.create_report: - postproc.save_data(results) + save_data(results) if __name__ == '__main__': main() diff --git a/storage-conf/workload/bert.yaml b/storage-conf/workload/bert.yaml index d730132..a7e6a66 100644 --- a/storage-conf/workload/bert.yaml +++ b/storage-conf/workload/bert.yaml @@ -29,6 +29,7 @@ reader: batch_size: 48 file_shuffle: seed sample_shuffle: seed + shuffle_size: 1024 checkpoint: checkpoint_folder: checkpoints/bert diff --git a/storage-conf/workload/unet3d.yaml b/storage-conf/workload/unet3d.yaml index 6f9b37c..d0e8409 100644 --- a/storage-conf/workload/unet3d.yaml +++ b/storage-conf/workload/unet3d.yaml @@ -21,6 +21,7 @@ reader: read_threads: 4 
file_shuffle: seed sample_shuffle: seed + shuffle_size: 32 train: seed_change_epoch: False From 9485ca8396957edc18c5a40cf3b4e8b6850a2c1e Mon Sep 17 00:00:00 2001 From: Johnu George Date: Sun, 21 May 2023 12:41:58 +0000 Subject: [PATCH 06/10] Add shuffle size --- storage-conf/workload/bert.yaml | 1 - storage-conf/workload/unet3d.yaml | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/storage-conf/workload/bert.yaml b/storage-conf/workload/bert.yaml index a7e6a66..d730132 100644 --- a/storage-conf/workload/bert.yaml +++ b/storage-conf/workload/bert.yaml @@ -29,7 +29,6 @@ reader: batch_size: 48 file_shuffle: seed sample_shuffle: seed - shuffle_size: 1024 checkpoint: checkpoint_folder: checkpoints/bert diff --git a/storage-conf/workload/unet3d.yaml b/storage-conf/workload/unet3d.yaml index d0e8409..cce3e09 100644 --- a/storage-conf/workload/unet3d.yaml +++ b/storage-conf/workload/unet3d.yaml @@ -21,7 +21,7 @@ reader: read_threads: 4 file_shuffle: seed sample_shuffle: seed - shuffle_size: 32 + shuffle_size: 4 train: seed_change_epoch: False From a93ac5bb703b6070e4dfb2c21288b38b874d7826 Mon Sep 17 00:00:00 2001 From: Johnu George Date: Sun, 21 May 2023 13:23:48 +0000 Subject: [PATCH 07/10] Fix result path --- report.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/report.py b/report.py index d200642..b10d345 100644 --- a/report.py +++ b/report.py @@ -77,11 +77,12 @@ def generate_report(self): train_throughput = [] train_au = [] for summary_file in summary_files: - path = summary_file.split("/") - if len(path) != 4: + run_path = os.path.relpath(summary_file, self.result_dir) + run_dir = run_path.split("/") + if len(run_dir) != 3: logging.error(f"Error: Directory structure {summary_file} is not correct. 
It has be in format result_dir/run(1..n)/host(1..n)/summary.json") sys.exit(1) - run_name = path[1] + run_name = run_dir[0] if run_name not in runs: runs[run_name] = [summary_file] else: From 952bfc4cfc87f9519799011d10b181734d2ddb27 Mon Sep 17 00:00:00 2001 From: Johnu George Date: Sun, 21 May 2023 18:34:55 +0000 Subject: [PATCH 08/10] Update README.md --- README.md | 65 +++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 44 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index cf66296..1555802 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,18 @@ # MLPerf™ Storage Benchmark Suite MLPerf Storage is a benchmark suite to characterize the performance of storage systems that support machine learning workloads. -- [Overview](#Overview) -- [Installation](#Installation) -- [Configuration](#Configuration) -- [Workloads](#Workloads) - - [U-Net3D](#U-Net3D) - - [BERT](#BERT) - - [DLRM](#DLRM) -- [Parameters](#Parameters) -- [Releases](#Releases) +- [Overview](#overview) +- [Installation](#installation) +- [Configuration](#configuration) +- [Workloads](#workloads) + - [U-Net3D](#u-net3d) + - [BERT](#bert) + - [DLRM](#dlrm) +- [Parameters](#parameters) + - [CLOSED](#closed) + - [OPEN](#open) +- [Releases](#releases) +- [Submission Rules](#submission-rules) ## Overview This section describes how to use the MLPerf™ Storage Benchmark to measure the performance of a storage system supporting a compute cluster running AI/ML training tasks. @@ -69,6 +72,7 @@ The working directory structure is as follows ``` |---storage |---benchmark.sh + |---report.py |---dlio_benchmark |---storage-conf |---workload(folder contains configs of all workloads) @@ -165,13 +169,13 @@ For running benchmark on `unet3d` workload with data located in `unet3d_data` di ./benchmark.sh run --workload unet3d --num-accelerators 4 --results-dir unet3d_results --param dataset.data_folder=unet3d_data ``` -4. Reports are generated from the benchmark results +4. 
Benchmark submission report is generated by aggregating the individual run results. ```bash ./benchmark.sh reportgen -h Usage: ./benchmark.sh reportgen [options] -Generate a report from the benchmark results. Supports single host and multi host run. +Generate a report from the benchmark results. Options: @@ -179,7 +183,7 @@ Options: -r, --results-dir Location to the results directory ``` -For multi-host run, the results need to be in the following structure. +The result directory needs to be in the following structure which must include 5 runs. ``` sample-results @@ -200,7 +204,7 @@ sample-results |---host-n |---summary.json ..... - |---run-n + |---run-5 |---host-1 |---summary.json |---host-2 @@ -210,12 +214,13 @@ sample-results |---summary.json ``` -To generate multi host report, +To generate the benchmark report, ```bash ./benchmark.sh reportgen --results-dir sample-results/ ``` +For reference, a sample result directory structure can be found [here](https://github.com/johnugeorge/mlperf-storage-sample-results). ## Workloads Currently, the storage benchmark suite supports benchmarking of 3 deep learning workloads @@ -223,7 +228,7 @@ Currently, the storage benchmark suite supports benchmarking of 3 deep learning - Natural language processing using BERT model ([bert](./storage-conf/workloads/bert.yaml)) - Recommendation using DLRM model (TODO) -### U-Net3D Workload +### U-Net3D Calculate minimum dataset size required for the benchmark run @@ -243,14 +248,14 @@ Run the benchmark. ./benchmark.sh run --workload unet3d --num-accelerators 8 --param dataset.num_files_train=3200 ``` -All results will be stored in ```results/unet3d/$DATE-$TIME``` folder or in the directory when overriden using `--results-dir`(or `-r`) argument. To generate the final report, one can do +All results will be stored in ```results/unet3d/$DATE-$TIME``` folder or in the directory when overridden using `--results-dir`(or `-r`) argument. 
To generate the final report, one can do ```bash ./benchmark.sh reportgen --results-dir results/unet3d/$DATE-$TIME ``` This will generate ```mlperf_storage_report.json``` in the output folder. -### BERT Workload +### BERT Calculate minimum dataset size required for the benchmark run @@ -269,7 +274,7 @@ Run the benchmark ./benchmark.sh run --workload bert --num-accelerators 8 --param dataset.num_files_train=350 ``` -All results will be stored in ```results/bert/$DATE-$TIME``` folder or in the directory when overriden using `--results-dir`(or `-r`) argument. To generate the final report, one can do +All results will be stored in ```results/bert/$DATE-$TIME``` folder or in the directory when overridden using `--results-dir`(or `-r`) argument. To generate the final report, one can do ```bash ./benchmark.sh reportgen -r results/bert/$DATE-$TIME @@ -277,13 +282,14 @@ All results will be stored in ```results/bert/$DATE-$TIME``` folder or in the di This will generate ```mlperf_storage_report.json``` in the output folder. -### DLRM Workload +### DLRM To be added ## Parameters -Below table displays the list of configurable paramters for the benchmark. +### CLOSED +Below table displays the list of configurable parameters for the benchmark in the closed category. | Parameter | Description |Default| | ------------------------------ | ------------------------------------------------------------ |-------| @@ -293,10 +299,27 @@ Below table displays the list of configurable paramters for the benchmark. 
| dataset.data_folder | The path where dataset is stored | --|
| **Reader params** | | |
| reader.read_threads | Number of threads to load the data | --|
-| reader.computation_threads | Number of threads to preprocess the data(only for bert) | --|
+| reader.computation_threads | Number of threads to preprocess the data (only for Bert) | --|
| **Checkpoint params** | | |
| checkpoint.checkpoint_folder | The folder to save the checkpoints | --|
| **Storage params** | | |
| storage.storage_root | The storage root directory | ./|
| storage.storage_type | The storage type |local_fs|
+
+### OPEN
+In addition to what can be changed in the CLOSED category, the following parameters can be changed in the OPEN category.
+
+| Parameter | Description |Default|
+| ------------------------------ | ------------------------------------------------------------ |-------|
+| framework | The machine learning framework |PyTorch for 3D U-Net, Tensorflow for Bert |
+| **Dataset params** | | |
+| dataset.format | Format of the dataset | .npz for 3D U-Net and tfrecord for Bert|
+| dataset.num_samples_per_file | Number of samples per file (only for Tensorflow using tfrecord datasets) | For 3D U-Net: 1 and for Bert: 313532|
+| **Reader params** | | |
+| reader.data_loader | Data loader type (Tensorflow or PyTorch or custom) | PyTorch for 3D U-Net, and Tensorflow for Bert|
+| reader.transfer_size | Number of bytes in the read buffer (only for Tensorflow) | For BERT: 262144|
+
+## Submission Rules
+
+MLPerf™ Storage Benchmark submission rules are described in this [doc](https://docs.google.com/document/d/1QOaCLiWb82H9cwdVX5KyeDZWt0781y4SgMQPhoij-b4/edit). If you have questions, please contact [Storage WG chairs](https://mlcommons.org/en/groups/research-storage/). 
From d4ca943a82d8fee8fd4a62afc632f3ca7d8f147f Mon Sep 17 00:00:00 2001 From: Johnu George Date: Sun, 21 May 2023 18:39:41 +0000 Subject: [PATCH 09/10] Update Readme --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 1555802..f95312c 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,6 @@ MLPerf Storage is a benchmark suite to characterize the performance of storage s - [Parameters](#parameters) - [CLOSED](#closed) - [OPEN](#open) -- [Releases](#releases) - [Submission Rules](#submission-rules) ## Overview From b3f098c7da12217329d87d1be6236305d5ba323c Mon Sep 17 00:00:00 2001 From: Johnu George Date: Tue, 23 May 2023 10:26:26 +0000 Subject: [PATCH 10/10] Adding model name to report --- dlio_benchmark | 2 +- report.py | 24 ++++++++++++++++++------ 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/dlio_benchmark b/dlio_benchmark index 130b18c..ad006e7 160000 --- a/dlio_benchmark +++ b/dlio_benchmark @@ -1 +1 @@ -Subproject commit 130b18cb4c53dd5d603f9c72cbaff843ad34c5f0 +Subproject commit ad006e7d9a9ec3d9aceccc369001495ffeba1928 diff --git a/report.py b/report.py index b10d345..deec441 100644 --- a/report.py +++ b/report.py @@ -96,6 +96,7 @@ def generate_report(self): sys.exit(1) num_hosts = host_arr[0] for run_name in runs: + models = [] num_acclerators = [] train_throughput_sps = [] train_throughput_mps = [] @@ -110,6 +111,7 @@ def generate_report(self): if float(au) < AU_THRESHOLD: logging.error(f"Error: AU value didn't pass the threshold in the run reported by {summary_file}") sys.exit(1) + models.append(summary['model']) num_acclerators.append(summary['num_accelerators']) train_throughput_sps.append(summary['metric']['train_throughput_mean_samples_per_second']) train_throughput_mps.append(summary['metric']['train_io_mean_MB_per_second']) @@ -120,14 +122,17 @@ def generate_report(self): if len(set(host_names)) != len(host_names): logging.warning(f"Warning: Hostnames in results of run {run_name} are not 
unique.") + if not check_unique(models): + logging.error(f"Error: The model name is different across hosts") + sys.exit(1) if not check_unique(num_acclerators): - logging.error(f"Error: Number of accelerators are different across hosts") + logging.error(f"Error: The number of accelerators is different across hosts") sys.exit(1) if not check_unique(num_files_train): - logging.error(f"Error: Number of training files are different across hosts") + logging.error(f"Error: The number of training files is different across hosts") sys.exit(1) if not check_unique(num_samples_per_file): - logging.error(f"Error: Number of samples per file are different across hosts") + logging.error(f"Error: The number of samples per file is different across hosts") sys.exit(1) if not check_timestamps(start_host_timestamp): logging.error(f"Error: Start timestamps of all hosts in each run must be within {MAX_START_TIMESTAMP_GAP} sec") @@ -136,25 +141,31 @@ def generate_report(self): results["runs"][run_name]["train_throughput_samples_per_second"] = np.sum(np.array(train_throughput_sps)) results["runs"][run_name]["train_throughput_MB_per_second"] = np.sum(np.array(train_throughput_mps)) results["runs"][run_name]["train_num_accelerators"] = np.sum(np.array(num_acclerators)) + results["runs"][run_name]["model"] = models[0] results["runs"][run_name]["num_files_train"] = num_files_train[0] results["runs"][run_name]["num_samples_per_file"] = num_samples_per_file[0] overall_train_throughput_sps = [results["runs"][run_name]["train_throughput_samples_per_second"] for run_name in results["runs"]] overall_train_throughput_mps = [results["runs"][run_name]["train_throughput_MB_per_second"] for run_name in results["runs"]] + overall_model = [results["runs"][run_name]["model"] for run_name in results["runs"]] overall_train_num_accelerators = [results["runs"][run_name]["train_num_accelerators"] for run_name in results["runs"]] overall_num_files_train = [results["runs"][run_name]["num_files_train"] for 
run_name in results["runs"]] overall_num_samples_per_file = [results["runs"][run_name]["num_samples_per_file"] for run_name in results["runs"]] + if not check_unique(overall_model): + logging.error(f"Error: The model name is different across runs") + sys.exit(1) if not check_unique(overall_train_num_accelerators): - logging.error(f"Error: Number of accelerators are different across runs") + logging.error(f"Error: The number of accelerators is different across runs") sys.exit(1) if not check_unique(overall_num_files_train): - logging.error(f"Error: Number of training files are different across runs") + logging.error(f"Error: The number of training files is different across runs") sys.exit(1) if not check_unique(overall_num_samples_per_file): - logging.error(f"Error: Number of samples per file are different across runs") + logging.error(f"Error: The number of samples per file is different across runs") sys.exit(1) + results["overall"]["model"] = overall_model[0] results["overall"]["num_client_hosts"] = num_hosts results["overall"]["num_benchmark_runs"] = len(results["runs"]) results["overall"]["train_num_accelerators"] = overall_train_num_accelerators[0] @@ -165,6 +176,7 @@ def generate_report(self): results["overall"]["train_throughput_mean_MB_per_second"] = np.mean(overall_train_throughput_mps) results["overall"]["train_throughput_stdev_MB_per_second"] = np.std(overall_train_throughput_mps) logging.info("------------------------------") + logging.info(f'Model: {results["overall"]["model"]}') logging.info(f'Number of client hosts: {results["overall"]["num_client_hosts"]}') logging.info(f'Number of benchmark runs: {results["overall"]["num_benchmark_runs"]}') logging.info(f'Overall number of accelerators: {results["overall"]["train_num_accelerators"]}')