From cc5ee388bea7af255a746e8361f6a06992958cc1 Mon Sep 17 00:00:00 2001 From: Johnu George Date: Wed, 26 Apr 2023 20:10:17 +0000 Subject: [PATCH 01/10] Changes for allowing runs with variable dataset sizes --- dlio_benchmark | 2 +- storage-conf/workload/bert.yaml | 1 + storage-conf/workload/unet3d.yaml | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/dlio_benchmark b/dlio_benchmark index b97f2a8..58db52b 160000 --- a/dlio_benchmark +++ b/dlio_benchmark @@ -1 +1 @@ -Subproject commit b97f2a8b34f4c7d21daf4eb3987df1b0aa1c4650 +Subproject commit 58db52b9e832361c637d9e454a81c0e63ba8aab8 diff --git a/storage-conf/workload/bert.yaml b/storage-conf/workload/bert.yaml index 07cb044..d730132 100644 --- a/storage-conf/workload/bert.yaml +++ b/storage-conf/workload/bert.yaml @@ -17,6 +17,7 @@ dataset: file_prefix: part train: + seed_change_epoch: False computation_time: 0.968 total_training_steps: 1000 diff --git a/storage-conf/workload/unet3d.yaml b/storage-conf/workload/unet3d.yaml index 4e486f8..6f9b37c 100644 --- a/storage-conf/workload/unet3d.yaml +++ b/storage-conf/workload/unet3d.yaml @@ -23,6 +23,7 @@ reader: sample_shuffle: seed train: + seed_change_epoch: False epochs: 5 computation_time: 1.3604 From 4f4d56bcb55e34f517d73a8cdf26b72748cd7e57 Mon Sep 17 00:00:00 2001 From: Nathan Wasson Date: Tue, 16 May 2023 17:34:28 -0500 Subject: [PATCH 02/10] Update license header --- LICENSE.md | 305 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 177 insertions(+), 128 deletions(-) diff --git a/LICENSE.md b/LICENSE.md index 96fe062..f433b1a 100644 --- a/LICENSE.md +++ b/LICENSE.md @@ -1,128 +1,177 @@ -MLCOMMONS ASSOCIATION LICENSE - -TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - -This license reproduces without alteration the terms of the Apache License - -Version 2.0, January 2004 - -http://www.apache.org/licenses/ - -1. Definitions. 
-"License" shall mean the terms and conditions for use, reproduction, and distribution as defined by -Sections 1 through 9 of this document. -"Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is -granting the License. -"Legal Entity" shall mean the union of the acting entity and all other entities that control, are -controlled by, or are under common control with that entity. For the purposes of this definition, -"control" means (i) the power, direct or indirect, to cause the direction or management of such -entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the -outstanding shares, or (iii) beneficial ownership of such entity. -"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this -License. -"Source" form shall mean the preferred form for making modifications, including but not limited to -software source code, documentation source, and configuration files. -"Object" form shall mean any form resulting from mechanical transformation or translation of a -Source form, including but not limited to compiled object code, generated documentation, and -conversions to other media types. -"Work" shall mean the work of authorship, whether in Source or Object form, made available under -the License, as indicated by a copyright notice that is included in or attached to the work (an example -is provided in the Appendix below). -"Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or -derived from) the Work and for which the editorial revisions, annotations, elaborations, or other -modifications represent, as a whole, an original work of authorship. For the purposes of this License, -Derivative Works shall not include works that remain separable from, or merely link (or bind by -name) to the interfaces of, the Work and Derivative Works thereof. 
-"Contribution" shall mean any work of authorship, including the original version of the Work and -any modifications or additions to that Work or Derivative Works thereof, that is intentionally -submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal -Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, -"submitted" means any form of electronic, verbal, or written communication sent to the Licensor or -its representatives, including but not limited to communication on electronic mailing lists, source -code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor -for the purpose of discussing and improving the Work, but excluding communication that is -conspicuously marked or otherwise designated in writing by the copyright owner as "Not a -Contribution." -"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a -Contribution has been received by Licensor and subsequently incorporated within the Work. - -2. Grant of Copyright License. Subject to the terms and conditions of this License, each -Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, -1106217.1 -irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly -perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. - -3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor -hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable -(except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, -and otherwise transfer the Work, where such license applies only to those patent claims licensable by -such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of -their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute -patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging -that the Work or a Contribution incorporated within the Work constitutes direct or contributory -patent infringement, then any patent licenses granted to You under this License for that Work shall -terminate as of the date such litigation is filed. - -4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works -thereof in any medium, with or without modifications, and in Source or Object form, provided that -You meet the following conditions: - - 1. You must give any other recipients of the Work or Derivative Works a copy of this License; -and - - 2. You must cause any modified files to carry prominent notices stating that You changed the -files; and - - 3. You must retain, in the Source form of any Derivative Works that You distribute, all -copyright, patent, trademark, and attribution notices from the Source form of the Work, -excluding those notices that do not pertain to any part of the Derivative Works; and - - 4. 
If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative -Works that You distribute must include a readable copy of the attribution notices contained -within such NOTICE file, excluding those notices that do not pertain to any part of the -Derivative Works, in at least one of the following places: within a NOTICE text file -distributed as part of the Derivative Works; within the Source form or documentation, if -provided along with the Derivative Works; or, within a display generated by the Derivative -Works, if and wherever such third-party notices normally appear. The contents of the -NOTICE file are for informational purposes only and do not modify the License. You may -add Your own attribution notices within Derivative Works that You distribute, alongside or -as an addendum to the NOTICE text from the Work, provided that such additional -attribution notices cannot be construed as modifying the License. -You may add Your own copyright statement to Your modifications and may provide -additional or different license terms and conditions for use, reproduction, or distribution of -Your modifications, or for any such Derivative Works as a whole, provided Your use, -reproduction, and distribution of the Work otherwise complies with the conditions stated in -this License. - -5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution -intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms -and conditions of this License, without any additional terms or conditions. Notwithstanding the -above, nothing herein shall supersede or modify the terms of any separate license agreement you -may have executed with Licensor regarding such Contributions. -6. Trademarks. 
This License does not grant permission to use the trade names, trademarks, -service marks, or product names of the Licensor, except as required for reasonable and customary -use in describing the origin of the Work and reproducing the content of the NOTICE file. - -7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor -provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, -without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, -MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for -determining the appropriateness of using or redistributing the Work and assume any risks associated -with Your exercise of permissions under this License. - -8. Limitation of Liability. In no event and under no legal theory, whether in tort (including -negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly -negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including -any direct, indirect, special, incidental, or consequential damages of any character arising as a result -of this License or out of the use or inability to use the Work (including but not limited to damages for -loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial -damages or losses), even if such Contributor has been advised of the possibility of such damages. - -9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative -Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, -indemnity, or other liability obligations and/or rights consistent with this License. 
However, in -accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not -on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each -Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by -reason of your accepting any such warranty or additional liability. + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS From 8eb04c5975c176ba9520a3a560ccd627417c4004 Mon Sep 17 00:00:00 2001 From: Nathan Wasson Date: Tue, 16 May 2023 17:35:31 -0500 Subject: [PATCH 03/10] Create CODEOWNERS --- .github/CODEOWNERS | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 .github/CODEOWNERS diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS new file mode 100644 index 0000000..2d2a1b2 --- /dev/null +++ b/.github/CODEOWNERS @@ -0,0 +1,3 @@ +# These owners will be the default owners for everything in the repo. 
+# Unless a later match takes precedence,they will be requested for review when someone opens a pull request. +* @mlcommons/wg-storage From 5266a2b154b676cd3360e4fa675160db60bdcb20 Mon Sep 17 00:00:00 2001 From: Johnu George Date: Thu, 18 May 2023 19:29:03 +0000 Subject: [PATCH 04/10] Update DLIO to latest --- dlio_benchmark | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dlio_benchmark b/dlio_benchmark index 58db52b..130b18c 160000 --- a/dlio_benchmark +++ b/dlio_benchmark @@ -1 +1 @@ -Subproject commit 58db52b9e832361c637d9e454a81c0e63ba8aab8 +Subproject commit 130b18cb4c53dd5d603f9c72cbaff843ad34c5f0 From 3b1f07a816ac5c8caa8e38407de35e361591cc9f Mon Sep 17 00:00:00 2001 From: Johnu George Date: Sat, 20 May 2023 20:17:37 +0000 Subject: [PATCH 05/10] Script changes for dataset subset selection with validation --- benchmark.sh | 10 +- report.py | 254 ++++++++++++++++-------------- storage-conf/workload/bert.yaml | 1 + storage-conf/workload/unet3d.yaml | 1 + 4 files changed, 146 insertions(+), 120 deletions(-) diff --git a/benchmark.sh b/benchmark.sh index 8364917..efa6344 100755 --- a/benchmark.sh +++ b/benchmark.sh @@ -10,7 +10,7 @@ WORKLOADS=("unet3d" "bert") UNET3D_CONFIG_FILE=${CONFIG_PATH}/workload/unet3d.yaml BERT_CONFIG_FILE=${CONFIG_PATH}/workload/bert.yaml # Currently only "closed" category is supported -CATEGORIES=("closed") +CATEGORIES=("closed" "open") DEFAULT_CATEGORY="closed" CLOSED_CATEGORY_PARAMS=( # dataset params @@ -25,6 +25,12 @@ CLOSED_CATEGORY_PARAMS=( OPEN_CATEGORY_PARAMS=( # all closed params "${CLOSED_CATEGORY_PARAMS[@]}" + # framework params + "framework" + # dataset params + "dataset.format" "dataset.num_samples_per_file" + # reader params + "reader.data_loader" "reader.transfer_size" ) HYDRA_OUTPUT_CONFIG_DIR="configs" EXTRA_PARAMS=( @@ -272,7 +278,7 @@ configview() { postprocess() { local results_dir=$1 - python3 report.py --result-dir $results_dir --multi-host --create-report + python3 report.py --result-dir 
$results_dir } main() { diff --git a/report.py b/report.py index 9e7c106..d200642 100644 --- a/report.py +++ b/report.py @@ -4,6 +4,7 @@ import logging import argparse import numpy as np +from dateutil import parser # final report created by Storage benchmark run REPORT_FILE = "mlperf_storage_report.json" @@ -14,139 +15,161 @@ # summary file created by DLIO in the results folder after every run SUMMARY_FILE = "summary.json" -class StorageReport(object): +# minimum runs required for the submission +REQUIRED_BENCHMARK_RUNS = 5 - def __init__(self, args): - # summary file create - self.result_dir = args.result_dir - self.multi_host = args.multi_host +# Maximum start time gap between host runs in seconds +MAX_START_TIMESTAMP_GAP = 10 - def find_file_path(self, directory): +def find_file_path(directory): found_files = [] for root, dirs, files in os.walk(directory): if SUMMARY_FILE in files: found_files.append(os.path.join(root, SUMMARY_FILE)) return found_files - def save_data(self, results): - # Dump statistic counters to files - # Overall stats - with open(REPORT_FILE, 'w') as outfile: - json.dump(results, outfile, indent=4, default=str) - logging.info(f"Final report generated: {REPORT_FILE}") - - - # read summary for DLIO summary file - def get_summary(self, summary_file): - f = open(summary_file) - summary = json.load(f) - num_acclerators = summary['num_accelerators'] - host_names = summary['hostname'] - au = summary['metric']['train_au_mean_percentage'] - throughput_sps = summary['metric']['train_throughput_mean_samples_per_second'] - throughput_mps = summary['metric']['train_io_mean_MB_per_second'] - return (num_acclerators, au, throughput_sps, throughput_mps, host_names) +def save_data(results): + # Dump statistic counters to files + # Overall stats + with open(REPORT_FILE, 'w') as outfile: + json.dump(results, outfile, indent=4, default=str) + logging.info(f"Final report generated: {REPORT_FILE}") + +def check_unique(list_arg): + if len(set(list_arg)) == 1: + 
return True + else: + return False + +def check_timestamps(start_timestamps): + ts = list(map(lambda x: parser.parse(x),start_timestamps)) + max_ts = max(ts) + min_ts = min(ts) + if (max_ts-min_ts).total_seconds() > MAX_START_TIMESTAMP_GAP: + return False + return True + +# read summary for DLIO summary file +def get_summary(summary_file): + f = open(summary_file) + summary = json.load(f) + return summary + +class StorageReport(object): + + def __init__(self, args): + # summary file create + self.result_dir = args.result_dir # accumulate results from multiple directories in case of multi hosts # report benchmark success or failure in case of a single host def generate_report(self): runs = {} - summary_files = self.find_file_path(self.result_dir) + summary_files = find_file_path(self.result_dir) if len(summary_files) == 0: logging.error(f"Error: {SUMMARY_FILE} file not found in {self.result_dir}") sys.exit(1) - # report benchmark success or failure in case of a single host - if not self.multi_host: - result = {} - if len(summary_files) > 1: - logging.error(f"Error: Multiple files found with the same file name {SUMMARY_FILE}") + + # accumulate results from multiple directories in case of multi hosts + results={} + results["overall"] = {} + results["runs"] = {} + train_throughput = [] + train_au = [] + for summary_file in summary_files: + path = summary_file.split("/") + if len(path) != 4: + logging.error(f"Error: Directory structure {summary_file} is not correct. 
It has be in format result_dir/run(1..n)/host(1..n)/summary.json") sys.exit(1) + run_name = path[1] + if run_name not in runs: + runs[run_name] = [summary_file] else: - #report success/failure and return - metrics = self.get_summary(summary_files[0]) - num_acclerators = metrics[0] - au = metrics[1] - throughput_sps = metrics[2] - throughput_mps = metrics[3] - - status = "succeeded" if float(au) >= AU_THRESHOLD else "failed" - logging.info("------------------------------") - logging.info(f"Benchmark {status}") - logging.info(f"Number of accelerators: {num_acclerators}") - logging.info(f"Average training throughput: {throughput_sps:.2f} samples/sec({throughput_mps:.2f} MB/sec)") - logging.info("------------------------------") - result = {"status": status} - return result - # accumulate results from multiple directories in case of multi hosts - else: - results={} - results["overall"] = {} - results["runs"] = {} - train_throughput = [] - train_au = [] - for summary_file in summary_files: - path = summary_file.split("/") - if len(path) != 4: - logging.error(f"Error: Directory structure {summary_file} is not correct. It has be in format result_dir/run(1..n)/host(1..n)/summary.json") + runs[run_name].append(summary_file) + if len(runs) != REQUIRED_BENCHMARK_RUNS: + logging.error(f"Error: Results are reported only for {len(runs)} runs. 
{REQUIRED_BENCHMARK_RUNS} runs are required.") + sys.exit(1) + host_arr = [len(runs[run_name]) for run_name in runs] + if len(set(host_arr)) != 1: + logging.error("Error: Number of participating hosts must be same across all runs") + sys.exit(1) + num_hosts = host_arr[0] + for run_name in runs: + num_acclerators = [] + train_throughput_sps = [] + train_throughput_mps = [] + host_names = [] + num_files_train = [] + num_samples_per_file = [] + start_host_timestamp = [] + results["runs"][run_name] ={} + for summary_file in runs[run_name]: + summary = get_summary(summary_file) + au = summary['metric']['train_au_mean_percentage'] + if float(au) < AU_THRESHOLD: + logging.error(f"Error: AU value didn't pass the threshold in the run reported by {summary_file}") sys.exit(1) - run_name = path[1] - if run_name not in runs: - runs[run_name] = [summary_file] - else: - runs[run_name].append(summary_file) - host_arr = [len(runs[run_name]) for run_name in runs] - if len(set(host_arr)) != 1: - logging.error("Error: Number of participating hosts must be same across all runs") + num_acclerators.append(summary['num_accelerators']) + train_throughput_sps.append(summary['metric']['train_throughput_mean_samples_per_second']) + train_throughput_mps.append(summary['metric']['train_io_mean_MB_per_second']) + host_names.append(summary['hostname']) + num_files_train.append(summary['num_files_train']) + num_samples_per_file.append(summary['num_samples_per_file']) + start_host_timestamp.append(summary['start']) + if len(set(host_names)) != len(host_names): + logging.warning(f"Warning: Hostnames in results of run {run_name} are not unique.") + + if not check_unique(num_acclerators): + logging.error(f"Error: Number of accelerators are different across hosts") sys.exit(1) - num_hosts = host_arr[0] - for run_name in runs: - num_acclerators = [] - train_throughput_sps = [] - train_throughput_mps = [] - host_names = [] - results["runs"][run_name] ={} - for summary_file in runs[run_name]: - summary = 
self.get_summary(summary_file) - au = summary[1] - if float(au) < AU_THRESHOLD: - logging.error(f"Error: AU value didn't pass the threshold in the run reported by {summary_file}") - sys.exit(1) - num_acclerators.append(summary[0]) - train_throughput_sps.append(summary[1]) - train_throughput_mps.append(summary[3]) - host_names.append(summary[4]) - if len(set(host_names)) != len(host_names): - logging.warning(f"Warning: Hostnames in results of run {run_name} are not unique") - - #results["runs"][run_name]["num_acclerators"] = num_acclerators - #results["runs"][run_name]["train_throughput_samples_per_second"] = train_throughput_sps - #results["runs"][run_name]["train_throughput_MB_per_second"] = train_throughput_mps - results["runs"][run_name]["train_throughput_samples_per_second"] = np.sum(np.array(train_throughput_sps)) - results["runs"][run_name]["train_throughput_MB_per_second"] = np.sum(np.array(train_throughput_mps)) - results["runs"][run_name]["train_num_accelerators"] = np.sum(np.array(num_acclerators)) - - overall_train_throughput_sps = [results["runs"][run_name]["train_throughput_samples_per_second"] for run_name in results["runs"]] - overall_train_throughput_mps = [results["runs"][run_name]["train_throughput_MB_per_second"] for run_name in results["runs"]] - overall_train_num_accelerators = [results["runs"][run_name]["train_num_accelerators"] for run_name in results["runs"]] - - if len(set(overall_train_num_accelerators)) != 1: - logging.error(f"Error: Number of accelerators are different across runs") + if not check_unique(num_files_train): + logging.error(f"Error: Number of training files are different across hosts") sys.exit(1) - results["overall"]["num_client_hosts"] = num_hosts - results["overall"]["num_benchmark_runs"] = len(results["runs"]) - results["overall"]["train_num_accelerators"] = overall_train_num_accelerators[0] - results["overall"]["train_throughput_mean_samples_per_second"] = np.mean(overall_train_throughput_sps) - 
results["overall"]["train_throughput_stdev_samples_per_second"] = np.std(overall_train_throughput_sps) - results["overall"]["train_throughput_mean_MB_per_second"] = np.mean(overall_train_throughput_mps) - results["overall"]["train_throughput_stdev_MB_per_second"] = np.std(overall_train_throughput_mps) - results["overall"]["train_num_accelerators"] = overall_train_num_accelerators[0] - logging.info("------------------------------") - logging.info(f'Number of client hosts: {results["overall"]["num_client_hosts"]}') - logging.info(f'Number of benchmark runs: {results["overall"]["num_benchmark_runs"]}') - logging.info(f'Overall number of accelerators: {results["overall"]["train_num_accelerators"]}') - logging.info(f'Overall Training Throughput (samples/second): {results["overall"]["train_throughput_mean_samples_per_second"]:.2f} ({results["overall"]["train_throughput_stdev_samples_per_second"]})') - logging.info(f'Overall Training Throughput (MB/second): {results["overall"]["train_throughput_mean_MB_per_second"]:.2f} ({results["overall"]["train_throughput_stdev_MB_per_second"]})') - logging.info("------------------------------") + if not check_unique(num_samples_per_file): + logging.error(f"Error: Number of samples per file are different across hosts") + sys.exit(1) + if not check_timestamps(start_host_timestamp): + logging.error(f"Error: Start timestamps of all hosts in each run must be within {MAX_START_TIMESTAMP_GAP} sec") + sys.exit(1) + + results["runs"][run_name]["train_throughput_samples_per_second"] = np.sum(np.array(train_throughput_sps)) + results["runs"][run_name]["train_throughput_MB_per_second"] = np.sum(np.array(train_throughput_mps)) + results["runs"][run_name]["train_num_accelerators"] = np.sum(np.array(num_acclerators)) + results["runs"][run_name]["num_files_train"] = num_files_train[0] + results["runs"][run_name]["num_samples_per_file"] = num_samples_per_file[0] + + overall_train_throughput_sps = 
[results["runs"][run_name]["train_throughput_samples_per_second"] for run_name in results["runs"]] + overall_train_throughput_mps = [results["runs"][run_name]["train_throughput_MB_per_second"] for run_name in results["runs"]] + overall_train_num_accelerators = [results["runs"][run_name]["train_num_accelerators"] for run_name in results["runs"]] + overall_num_files_train = [results["runs"][run_name]["num_files_train"] for run_name in results["runs"]] + overall_num_samples_per_file = [results["runs"][run_name]["num_samples_per_file"] for run_name in results["runs"]] + + if not check_unique(overall_train_num_accelerators): + logging.error(f"Error: Number of accelerators are different across runs") + sys.exit(1) + if not check_unique(overall_num_files_train): + logging.error(f"Error: Number of training files are different across runs") + sys.exit(1) + if not check_unique(overall_num_samples_per_file): + logging.error(f"Error: Number of samples per file are different across runs") + sys.exit(1) + + results["overall"]["num_client_hosts"] = num_hosts + results["overall"]["num_benchmark_runs"] = len(results["runs"]) + results["overall"]["train_num_accelerators"] = overall_train_num_accelerators[0] + results["overall"]["num_files_train"] = overall_num_files_train[0] + results["overall"]["num_samples_per_file"] = overall_num_samples_per_file[0] + results["overall"]["train_throughput_mean_samples_per_second"] = np.mean(overall_train_throughput_sps) + results["overall"]["train_throughput_stdev_samples_per_second"] = np.std(overall_train_throughput_sps) + results["overall"]["train_throughput_mean_MB_per_second"] = np.mean(overall_train_throughput_mps) + results["overall"]["train_throughput_stdev_MB_per_second"] = np.std(overall_train_throughput_mps) + logging.info("------------------------------") + logging.info(f'Number of client hosts: {results["overall"]["num_client_hosts"]}') + logging.info(f'Number of benchmark runs: {results["overall"]["num_benchmark_runs"]}') + 
logging.info(f'Overall number of accelerators: {results["overall"]["train_num_accelerators"]}') + logging.info(f'Overall Training Throughput (samples/second): {results["overall"]["train_throughput_mean_samples_per_second"]:.2f} ({results["overall"]["train_throughput_stdev_samples_per_second"]})') + logging.info(f'Overall Training Throughput (MB/second): {results["overall"]["train_throughput_mean_MB_per_second"]:.2f} ({results["overall"]["train_throughput_stdev_MB_per_second"]})') + logging.info("------------------------------") return results def main(): @@ -156,10 +179,6 @@ def main(): parser = argparse.ArgumentParser(description='Storage report generator') parser.add_argument("-rd", "--result-dir", type=str, default="", help="Location to the results directory of a benchmark run which contains summary.json") - parser.add_argument("-sh", "--multi-host", action="store_true", - help="If set, multi host results are considered else single host results are considered") - parser.add_argument("-rg", "--create-report", action="store_true", - help="If set, result report file generation will be created ") logging.basicConfig( format='%(asctime)s %(message)s', level=logging.DEBUG, @@ -167,8 +186,7 @@ def main(): args = parser.parse_args() postproc = StorageReport(args) results = postproc.generate_report() - if args.create_report: - postproc.save_data(results) + save_data(results) if __name__ == '__main__': main() diff --git a/storage-conf/workload/bert.yaml b/storage-conf/workload/bert.yaml index d730132..a7e6a66 100644 --- a/storage-conf/workload/bert.yaml +++ b/storage-conf/workload/bert.yaml @@ -29,6 +29,7 @@ reader: batch_size: 48 file_shuffle: seed sample_shuffle: seed + shuffle_size: 1024 checkpoint: checkpoint_folder: checkpoints/bert diff --git a/storage-conf/workload/unet3d.yaml b/storage-conf/workload/unet3d.yaml index 6f9b37c..d0e8409 100644 --- a/storage-conf/workload/unet3d.yaml +++ b/storage-conf/workload/unet3d.yaml @@ -21,6 +21,7 @@ reader: read_threads: 4 
file_shuffle: seed sample_shuffle: seed + shuffle_size: 32 train: seed_change_epoch: False From 9485ca8396957edc18c5a40cf3b4e8b6850a2c1e Mon Sep 17 00:00:00 2001 From: Johnu George Date: Sun, 21 May 2023 12:41:58 +0000 Subject: [PATCH 06/10] Add shuffle size --- storage-conf/workload/bert.yaml | 1 - storage-conf/workload/unet3d.yaml | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/storage-conf/workload/bert.yaml b/storage-conf/workload/bert.yaml index a7e6a66..d730132 100644 --- a/storage-conf/workload/bert.yaml +++ b/storage-conf/workload/bert.yaml @@ -29,7 +29,6 @@ reader: batch_size: 48 file_shuffle: seed sample_shuffle: seed - shuffle_size: 1024 checkpoint: checkpoint_folder: checkpoints/bert diff --git a/storage-conf/workload/unet3d.yaml b/storage-conf/workload/unet3d.yaml index d0e8409..cce3e09 100644 --- a/storage-conf/workload/unet3d.yaml +++ b/storage-conf/workload/unet3d.yaml @@ -21,7 +21,7 @@ reader: read_threads: 4 file_shuffle: seed sample_shuffle: seed - shuffle_size: 32 + shuffle_size: 4 train: seed_change_epoch: False From a93ac5bb703b6070e4dfb2c21288b38b874d7826 Mon Sep 17 00:00:00 2001 From: Johnu George Date: Sun, 21 May 2023 13:23:48 +0000 Subject: [PATCH 07/10] Fix result path --- report.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/report.py b/report.py index d200642..b10d345 100644 --- a/report.py +++ b/report.py @@ -77,11 +77,12 @@ def generate_report(self): train_throughput = [] train_au = [] for summary_file in summary_files: - path = summary_file.split("/") - if len(path) != 4: + run_path = os.path.relpath(summary_file, self.result_dir) + run_dir = run_path.split("/") + if len(run_dir) != 3: logging.error(f"Error: Directory structure {summary_file} is not correct. 
It has be in format result_dir/run(1..n)/host(1..n)/summary.json") sys.exit(1) - run_name = path[1] + run_name = run_dir[0] if run_name not in runs: runs[run_name] = [summary_file] else: From 952bfc4cfc87f9519799011d10b181734d2ddb27 Mon Sep 17 00:00:00 2001 From: Johnu George Date: Sun, 21 May 2023 18:34:55 +0000 Subject: [PATCH 08/10] Update README.md --- README.md | 65 +++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 44 insertions(+), 21 deletions(-) diff --git a/README.md b/README.md index cf66296..1555802 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,18 @@ # MLPerf™ Storage Benchmark Suite MLPerf Storage is a benchmark suite to characterize the performance of storage systems that support machine learning workloads. -- [Overview](#Overview) -- [Installation](#Installation) -- [Configuration](#Configuration) -- [Workloads](#Workloads) - - [U-Net3D](#U-Net3D) - - [BERT](#BERT) - - [DLRM](#DLRM) -- [Parameters](#Parameters) -- [Releases](#Releases) +- [Overview](#overview) +- [Installation](#installation) +- [Configuration](#configuration) +- [Workloads](#workloads) + - [U-Net3D](#u-net3d) + - [BERT](#bert) + - [DLRM](#dlrm) +- [Parameters](#parameters) + - [CLOSED](#closed) + - [OPEN](#open) +- [Releases](#releases) +- [Submission Rules](#submission-rules) ## Overview This section describes how to use the MLPerf™ Storage Benchmark to measure the performance of a storage system supporting a compute cluster running AI/ML training tasks. @@ -69,6 +72,7 @@ The working directory structure is as follows ``` |---storage |---benchmark.sh + |---report.py |---dlio_benchmark |---storage-conf |---workload(folder contains configs of all workloads) @@ -165,13 +169,13 @@ For running benchmark on `unet3d` workload with data located in `unet3d_data` di ./benchmark.sh run --workload unet3d --num-accelerators 4 --results-dir unet3d_results --param dataset.data_folder=unet3d_data ``` -4. Reports are generated from the benchmark results +4. 
Benchmark submission report is generated by aggregating the individual run results. ```bash ./benchmark.sh reportgen -h Usage: ./benchmark.sh reportgen [options] -Generate a report from the benchmark results. Supports single host and multi host run. +Generate a report from the benchmark results. Options: @@ -179,7 +183,7 @@ Options: -r, --results-dir Location to the results directory ``` -For multi-host run, the results need to be in the following structure. +The result directory needs to be in the following structure which must include 5 runs. ``` sample-results @@ -200,7 +204,7 @@ sample-results |---host-n |---summary.json ..... - |---run-n + |---run-5 |---host-1 |---summary.json |---host-2 @@ -210,12 +214,13 @@ sample-results |---summary.json ``` -To generate multi host report, +To generate the benchmark report, ```bash ./benchmark.sh reportgen --results-dir sample-results/ ``` +For reference, a sample result directory structure can be found [here](https://github.com/johnugeorge/mlperf-storage-sample-results). ## Workloads Currently, the storage benchmark suite supports benchmarking of 3 deep learning workloads @@ -223,7 +228,7 @@ Currently, the storage benchmark suite supports benchmarking of 3 deep learning - Natural language processing using BERT model ([bert](./storage-conf/workloads/bert.yaml)) - Recommendation using DLRM model (TODO) -### U-Net3D Workload +### U-Net3D Calculate minimum dataset size required for the benchmark run @@ -243,14 +248,14 @@ Run the benchmark. ./benchmark.sh run --workload unet3d --num-accelerators 8 --param dataset.num_files_train=3200 ``` -All results will be stored in ```results/unet3d/$DATE-$TIME``` folder or in the directory when overriden using `--results-dir`(or `-r`) argument. To generate the final report, one can do +All results will be stored in ```results/unet3d/$DATE-$TIME``` folder or in the directory when overridden using `--results-dir`(or `-r`) argument. 
To generate the final report, one can do ```bash ./benchmark.sh reportgen --results-dir results/unet3d/$DATE-$TIME ``` This will generate ```mlperf_storage_report.json``` in the output folder. -### BERT Workload +### BERT Calculate minimum dataset size required for the benchmark run @@ -269,7 +274,7 @@ Run the benchmark ./benchmark.sh run --workload bert --num-accelerators 8 --param dataset.num_files_train=350 ``` -All results will be stored in ```results/bert/$DATE-$TIME``` folder or in the directory when overriden using `--results-dir`(or `-r`) argument. To generate the final report, one can do +All results will be stored in ```results/bert/$DATE-$TIME``` folder or in the directory when overridden using `--results-dir`(or `-r`) argument. To generate the final report, one can do ```bash ./benchmark.sh reportgen -r results/bert/$DATE-$TIME @@ -277,13 +282,14 @@ All results will be stored in ```results/bert/$DATE-$TIME``` folder or in the di This will generate ```mlperf_storage_report.json``` in the output folder. -### DLRM Workload +### DLRM To be added ## Parameters -Below table displays the list of configurable paramters for the benchmark. +### CLOSED +Below table displays the list of configurable parameters for the benchmark in the closed category. | Parameter | Description |Default| | ------------------------------ | ------------------------------------------------------------ |-------| @@ -293,10 +299,27 @@ Below table displays the list of configurable paramters for the benchmark. 
| dataset.data_folder | The path where dataset is stored | --|
| **Reader params** | | |
| reader.read_threads | Number of threads to load the data | --|
-| reader.computation_threads | Number of threads to preprocess the data(only for bert) | --|
+| reader.computation_threads | Number of threads to preprocess the data (only for Bert) | --|
| **Checkpoint params** | | |
| checkpoint.checkpoint_folder | The folder to save the checkpoints | --|
| **Storage params** | | |
| storage.storage_root | The storage root directory | ./|
| storage.storage_type | The storage type |local_fs|
+
+### OPEN
+In addition to what can be changed in the CLOSED category, the following parameters can be changed in the OPEN category.
+
+| Parameter | Description |Default|
+| ------------------------------ | ------------------------------------------------------------ |-------|
+| framework | The machine learning framework |PyTorch for 3D U-Net, Tensorflow for Bert |
+| **Dataset params** | | |
+| dataset.format | Format of the dataset | .npz for 3D U-Net and tfrecord for Bert|
+| dataset.num_samples_per_file | Number of samples per file (only for Tensorflow using tfrecord datasets) | For 3D U-Net: 1 and for Bert: 313532|
+| **Reader params** | | |
+| reader.data_loader | Data loader type (Tensorflow or PyTorch or custom) | PyTorch for 3D U-Net, and Tensorflow for Bert|
+| reader.transfer_size | Number of bytes in the read buffer (only for Tensorflow) | For BERT: 262144|
+
+## Submission Rules
+
+MLPerf™ Storage Benchmark submission rules are described in this [doc](https://docs.google.com/document/d/1QOaCLiWb82H9cwdVX5KyeDZWt0781y4SgMQPhoij-b4/edit). If you have questions, please contact [Storage WG chairs](https://mlcommons.org/en/groups/research-storage/). 
From d4ca943a82d8fee8fd4a62afc632f3ca7d8f147f Mon Sep 17 00:00:00 2001 From: Johnu George Date: Sun, 21 May 2023 18:39:41 +0000 Subject: [PATCH 09/10] Update Readme --- README.md | 1 - 1 file changed, 1 deletion(-) diff --git a/README.md b/README.md index 1555802..f95312c 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,6 @@ MLPerf Storage is a benchmark suite to characterize the performance of storage s - [Parameters](#parameters) - [CLOSED](#closed) - [OPEN](#open) -- [Releases](#releases) - [Submission Rules](#submission-rules) ## Overview From b3f098c7da12217329d87d1be6236305d5ba323c Mon Sep 17 00:00:00 2001 From: Johnu George Date: Tue, 23 May 2023 10:26:26 +0000 Subject: [PATCH 10/10] Adding model name to report --- dlio_benchmark | 2 +- report.py | 24 ++++++++++++++++++------ 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/dlio_benchmark b/dlio_benchmark index 130b18c..ad006e7 160000 --- a/dlio_benchmark +++ b/dlio_benchmark @@ -1 +1 @@ -Subproject commit 130b18cb4c53dd5d603f9c72cbaff843ad34c5f0 +Subproject commit ad006e7d9a9ec3d9aceccc369001495ffeba1928 diff --git a/report.py b/report.py index b10d345..deec441 100644 --- a/report.py +++ b/report.py @@ -96,6 +96,7 @@ def generate_report(self): sys.exit(1) num_hosts = host_arr[0] for run_name in runs: + models = [] num_acclerators = [] train_throughput_sps = [] train_throughput_mps = [] @@ -110,6 +111,7 @@ def generate_report(self): if float(au) < AU_THRESHOLD: logging.error(f"Error: AU value didn't pass the threshold in the run reported by {summary_file}") sys.exit(1) + models.append(summary['model']) num_acclerators.append(summary['num_accelerators']) train_throughput_sps.append(summary['metric']['train_throughput_mean_samples_per_second']) train_throughput_mps.append(summary['metric']['train_io_mean_MB_per_second']) @@ -120,14 +122,17 @@ def generate_report(self): if len(set(host_names)) != len(host_names): logging.warning(f"Warning: Hostnames in results of run {run_name} are not 
unique.") + if not check_unique(models): + logging.error(f"Error: The model name is different across hosts") + sys.exit(1) if not check_unique(num_acclerators): - logging.error(f"Error: Number of accelerators are different across hosts") + logging.error(f"Error: The number of accelerators is different across hosts") sys.exit(1) if not check_unique(num_files_train): - logging.error(f"Error: Number of training files are different across hosts") + logging.error(f"Error: The number of training files is different across hosts") sys.exit(1) if not check_unique(num_samples_per_file): - logging.error(f"Error: Number of samples per file are different across hosts") + logging.error(f"Error: The number of samples per file is different across hosts") sys.exit(1) if not check_timestamps(start_host_timestamp): logging.error(f"Error: Start timestamps of all hosts in each run must be within {MAX_START_TIMESTAMP_GAP} sec") @@ -136,25 +141,31 @@ def generate_report(self): results["runs"][run_name]["train_throughput_samples_per_second"] = np.sum(np.array(train_throughput_sps)) results["runs"][run_name]["train_throughput_MB_per_second"] = np.sum(np.array(train_throughput_mps)) results["runs"][run_name]["train_num_accelerators"] = np.sum(np.array(num_acclerators)) + results["runs"][run_name]["model"] = models[0] results["runs"][run_name]["num_files_train"] = num_files_train[0] results["runs"][run_name]["num_samples_per_file"] = num_samples_per_file[0] overall_train_throughput_sps = [results["runs"][run_name]["train_throughput_samples_per_second"] for run_name in results["runs"]] overall_train_throughput_mps = [results["runs"][run_name]["train_throughput_MB_per_second"] for run_name in results["runs"]] + overall_model = [results["runs"][run_name]["model"] for run_name in results["runs"]] overall_train_num_accelerators = [results["runs"][run_name]["train_num_accelerators"] for run_name in results["runs"]] overall_num_files_train = [results["runs"][run_name]["num_files_train"] for 
run_name in results["runs"]] overall_num_samples_per_file = [results["runs"][run_name]["num_samples_per_file"] for run_name in results["runs"]] + if not check_unique(overall_model): + logging.error(f"Error: The model name is different across runs") + sys.exit(1) if not check_unique(overall_train_num_accelerators): - logging.error(f"Error: Number of accelerators are different across runs") + logging.error(f"Error: The number of accelerators is different across runs") sys.exit(1) if not check_unique(overall_num_files_train): - logging.error(f"Error: Number of training files are different across runs") + logging.error(f"Error: The number of training files is different across runs") sys.exit(1) if not check_unique(overall_num_samples_per_file): - logging.error(f"Error: Number of samples per file are different across runs") + logging.error(f"Error: The number of samples per file is different across runs") sys.exit(1) + results["overall"]["model"] = overall_model[0] results["overall"]["num_client_hosts"] = num_hosts results["overall"]["num_benchmark_runs"] = len(results["runs"]) results["overall"]["train_num_accelerators"] = overall_train_num_accelerators[0] @@ -165,6 +176,7 @@ def generate_report(self): results["overall"]["train_throughput_mean_MB_per_second"] = np.mean(overall_train_throughput_mps) results["overall"]["train_throughput_stdev_MB_per_second"] = np.std(overall_train_throughput_mps) logging.info("------------------------------") + logging.info(f'Model: {results["overall"]["model"]}') logging.info(f'Number of client hosts: {results["overall"]["num_client_hosts"]}') logging.info(f'Number of benchmark runs: {results["overall"]["num_benchmark_runs"]}') logging.info(f'Overall number of accelerators: {results["overall"]["train_num_accelerators"]}')