diff --git a/CODEOWNERS b/CODEOWNERS index 2694bc18e..8421573a1 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -8,6 +8,7 @@ datasets @ashahba @claynerobison @dzungductran docs @claynerobison @mhbuehler k8s @ashahba @dzungductran models @ashraf-bhuiyan @riverliuintel +models @riverliuintel models/**/pytorch/ @leslie-fang-intel @jiayisunx @zhuhaozhe quickstart mahathi.vatsal.salopanthula@intel.com quickstart/**/pytorch/ @leslie-fang-intel @jiayisunx @zhuhaozhe diff --git a/README.md b/README.md index 4869713d4..4c200a12b 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Model Zoo for Intel® Architecture -This repository contains **links to pre-trained models, sample scripts, best practices, and step-by-step tutorials** for many popular open-source machine learning models optimized by Intel to run on Intel® Xeon® Scalable processors. +This repository contains **links to pre-trained models, sample scripts, best practices, and step-by-step tutorials** for many popular open-source machine learning models optimized by Intel to run on Intel® Xeon® Scalable processors and Intel® Data Center GPUs. Model packages and containers for running the Model Zoo's workloads can be found at the [Intel® Developer Catalog](https://software.intel.com/containers). diff --git a/benchmarks/common/base_benchmark_util.py b/benchmarks/common/base_benchmark_util.py index 73340b759..58cd8bbb9 100644 --- a/benchmarks/common/base_benchmark_util.py +++ b/benchmarks/common/base_benchmark_util.py @@ -1,7 +1,7 @@ # # -*- coding: utf-8 -*- # -# Copyright (c) 2023 Intel Corporation +# Copyright (c) 2018-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -281,6 +281,12 @@ def _define_args(self): help="Additional command line arguments (prefix flag start with" " '--').") + # Check if GPU is enabled. + self._common_arg_parser.add_argument( + "--gpu", + help="Run the benchmark script using GPU", + dest="gpu", action="store_true") + def _validate_args(self): """validate the args and initializes platform_util""" # check if socket id is in socket number range @@ -311,8 +317,9 @@ def _validate_args(self): format(system_num_cores)) if args.output_results and ((args.model_name != "resnet50" and - args.model_name != "resnet50v1_5") or args.precision != "fp32"): - raise ValueError("--output-results is currently only supported for resnet50 FP32 inference.") + args.model_name != "resnet50v1_5") or + (args.precision != "fp32" and args.precision != "fp16")): + raise ValueError("--output-results is currently only supported for resnet50 FP32 or FP16 inference.") elif args.output_results and (args.mode != "inference" or not args.data_location): raise ValueError("--output-results can only be used when running inference with a dataset.") @@ -355,6 +362,14 @@ def _validate_args(self): "This is less than the number of cores per socket on the system ({})". 
format(args.socket_id, cpuset_len_for_socket, self._platform_util.num_cores_per_socket)) + if args.gpu: + if args.socket_id != -1: + raise ValueError("--socket-id cannot be used with --gpu parameter.") + if args.num_intra_threads is not None: + raise ValueError("--num-intra-threads cannot be used with --gpu parameter.") + if args.num_inter_threads is not None: + raise ValueError("--num-inter-threads cannot be used with --gpu parameter.") + def initialize_model(self, args, unknown_args): """Create model initializer for the specified model""" model_initializer = None diff --git a/benchmarks/common/tensorflow/start.sh b/benchmarks/common/tensorflow/start.sh index 84368747f..9dbb0ee76 100644 --- a/benchmarks/common/tensorflow/start.sh +++ b/benchmarks/common/tensorflow/start.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash # -# Copyright (c) 2023 Intel Corporation +# Copyright (c) 2018-2023 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -54,7 +54,26 @@ echo " NUMA_CORES_PER_INSTANCE: ${NUMA_CORES_PER_INSTANCE}" echo " PYTHON_EXE: ${PYTHON_EXE}" echo " PYTHONPATH: ${PYTHONPATH}" echo " DRY_RUN: ${DRY_RUN}" - +echo " GPU: ${GPU}" + +# Enable GPU Flag +gpu_arg="" +is_model_gpu_supported="False" +if [ ${GPU} == "True" ]; then + gpu_arg="--gpu" + # Environment variables for GPU + export RenderCompressedBuffersEnabled=0 + export CreateMultipleSubDevices=1 + export ForceLocalMemoryAccessMode=1 + export SYCL_PI_LEVEL_ZERO_BATCH_SIZE=1 +else + unset RenderCompressedBuffersEnabled + unset CreateMultipleSubDevices + unset ForceLocalMemoryAccessMode + unset ForceNonSystemMemoryPlacement + unset TF_ENABLE_LAYOUT_OPT + unset SYCL_PI_LEVEL_ZERO_BATCH_SIZE +fi # inference & training is supported right now if [ ${MODE} != "inference" ] && [ ${MODE} != "training" ]; then echo "${MODE} mode for ${MODEL_NAME} is not supported" @@ -326,6 +345,10 @@ export PYTHONPATH=${PYTHONPATH}:${MOUNT_INTELAI_MODELS_COMMON_SOURCE}:${MOUNT_IN # Common execution command used by all models function run_model() { + if [ ${is_model_gpu_supported} == "False" ] && [ ${GPU} == "True" ]; then + echo "Running ${MODEL_NAME} ${MODE} with precision ${PRECISION} does not support --gpu." + exit 1 + fi # Navigate to the main benchmark directory before executing the script, # since the scripts use the benchmark/common scripts as well. cd ${MOUNT_BENCHMARK} @@ -390,7 +413,8 @@ ${benchmark_only_arg} \ ${output_results_arg} \ ${weight_sharing_arg} \ ${synthetic_data_arg} \ -${verbose_arg}" +${verbose_arg} \ +${gpu_arg}" if [ ${MOUNT_EXTERNAL_MODELS_SOURCE} != "None" ]; then CMD="${CMD} --model-source-dir=${MOUNT_EXTERNAL_MODELS_SOURCE}" @@ -978,6 +1002,7 @@ function resnet101_inceptionv3() { # ResNet50 model function resnet50() { export PYTHONPATH=${PYTHONPATH}:$(pwd):${MOUNT_BENCHMARK} + is_model_gpu_supported="True" # For accuracy, dataset location is required.
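For reference, the `--gpu` option added to `base_benchmark_util.py` above is a plain argparse `store_true` flag, and the new validation simply rejects the CPU-affinity options whenever it is set. A minimal standalone sketch of that pattern (the parser below is illustrative only, not the repo's `_common_arg_parser`):

```python
import argparse

# Minimal sketch of the --gpu flag and its validation, mirroring the checks
# added in base_benchmark_util.py above (illustrative, not the repo's parser).
parser = argparse.ArgumentParser()
parser.add_argument("--gpu", action="store_true",
                    help="Run the benchmark script using GPU")
parser.add_argument("--socket-id", dest="socket_id", type=int, default=-1)
parser.add_argument("--num-intra-threads", dest="num_intra_threads", type=int, default=None)
parser.add_argument("--num-inter-threads", dest="num_inter_threads", type=int, default=None)
args = parser.parse_args(["--gpu"])

if args.gpu:
    if args.socket_id != -1:
        raise ValueError("--socket-id cannot be used with --gpu parameter.")
    if args.num_intra_threads is not None:
        raise ValueError("--num-intra-threads cannot be used with --gpu parameter.")
    if args.num_inter_threads is not None:
        raise ValueError("--num-inter-threads cannot be used with --gpu parameter.")
```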
if [ "${DATASET_LOCATION_VOL}" == "None" ] && [ ${ACCURACY_ONLY} == "True" ]; then @@ -1062,6 +1087,7 @@ function rfcn() { # SSD-MobileNet model function ssd_mobilenet() { + is_model_gpu_supported="True" if [ ${PRECISION} == "fp32" ] || [ ${PRECISION} == "bfloat16" ]; then if [ ${BATCH_SIZE} != "-1" ]; then echo "Warning: SSD-MobileNet FP32 inference script does not use the batch_size arg" @@ -1404,7 +1430,21 @@ function wavenet() { # BERT base function bert_base() { - if [ ${PRECISION} == "fp32" ] || [ $PRECISION == "bfloat16" ]; then + if [ ${GPU} == "True" ]; then + if [ ${MODE} == "inference" ]; then + echo "PRECISION=${PRECISION} on GPU not supported for ${MODEL_NAME} ${MODE} in this repo." + exit 1 + elif [ ${MODE} == "training" ]; then + if [ ${PRECISION} != "fp32" ] && [ ${PRECISION} != "bfloat16" ]; then + echo "PRECISION=${PRECISION} on GPU not supported for ${MODEL_NAME} ${MODE} in this repo." + exit 1 + fi + fi + is_model_gpu_supported="True" + export PYTHONPATH=${PYTHONPATH}:${MOUNT_EXTERNAL_MODELS_SOURCE} + bert_options + CMD=${CMD} run_model + elif [ ${PRECISION} == "fp32" ] || [ $PRECISION == "bfloat16" ]; then export PYTHONPATH=${PYTHONPATH}:${MOUNT_EXTERNAL_MODELS_SOURCE} bert_options CMD=${CMD} run_model @@ -1416,11 +1456,58 @@ function bert_base() { # BERT Large model function bert_large() { - # Change if to support fp32 - if [ ${PRECISION} == "fp32" ] || [ $PRECISION == "int8" ] || [ $PRECISION == "bfloat16" ] || [ $PRECISION == "fp16" ]; then + export PYTHONPATH=${PYTHONPATH}:${MOUNT_BENCHMARK} + if [ ${GPU} == "True" ]; then + if [ ${MODE} == "inference" ]; then + if [ ${PRECISION} != "fp32" ] && [ ${PRECISION} != "fp16" ] && [ ${PRECISION} != "bfloat16" ]; then + echo "PRECISION=${PRECISION} on GPU not supported for ${MODEL_NAME} ${MODE} in this repo." + exit 1 + fi + elif [ ${MODE} == "training" ]; then + if [ ${PRECISION} != "fp32" ] && [ ${PRECISION} != "bfloat16" ]; then + echo "PRECISION=${PRECISION} on GPU not supported for ${MODEL_NAME} ${MODE} in this repo." + exit 1 + fi + fi + is_model_gpu_supported="True" export PYTHONPATH=${PYTHONPATH}:${MOUNT_EXTERNAL_MODELS_SOURCE} bert_options CMD=${CMD} run_model + else + if [ ${PRECISION} == "fp32" ] || [ $PRECISION == "int8" ] || [ $PRECISION == "bfloat16" ] || [ $PRECISION == "fp16" ]; then + export PYTHONPATH=${PYTHONPATH}:${MOUNT_EXTERNAL_MODELS_SOURCE} + bert_options + CMD=${CMD} run_model + else + echo "PRECISION=${PRECISION} not supported for ${MODEL_NAME} in this repo." + exit 1 + fi + fi +} + +# distilBERT base model +function distilbert_base() { + if [ ${PRECISION} == "fp32" ] || [ ${PRECISION} == "bfloat16" ]|| [ ${PRECISION} == "int8" ]; then + export PYTHONPATH=${PYTHONPATH}:${MOUNT_EXTERNAL_MODELS_SOURCE} + CMD="${CMD} $(add_arg "--warmup-steps" ${WARMUP_STEPS})" + CMD="${CMD} $(add_arg "--steps" ${STEPS})" + + if [ ${NUM_INTER_THREADS} != "None" ]; then + CMD="${CMD} $(add_arg "--num-inter-threads" ${NUM_INTER_THREADS})" + fi + + if [ ${NUM_INTRA_THREADS} != "None" ]; then + CMD="${CMD} $(add_arg "--num-intra-threads" ${NUM_INTRA_THREADS})" + fi + + if [ -z ${STEPS} ]; then + CMD="${CMD} $(add_arg "--steps" ${STEPS})" + fi + + if [ -z $MAX_SEQ_LENGTH ]; then + CMD="${CMD} $(add_arg "--max-seq-length" ${MAX_SEQ_LENGTH})" + fi + CMD=${CMD} run_model else echo "PRECISION=${PRECISION} not supported for ${MODEL_NAME} in this repo." 
exit 1 diff --git a/benchmarks/image_recognition/tensorflow/resnet50v1_5/inference/bfloat16/model_init.py b/benchmarks/image_recognition/tensorflow/resnet50v1_5/inference/bfloat16/model_init.py index 15a46531c..d078ae0b3 100644 --- a/benchmarks/image_recognition/tensorflow/resnet50v1_5/inference/bfloat16/model_init.py +++ b/benchmarks/image_recognition/tensorflow/resnet50v1_5/inference/bfloat16/model_init.py @@ -65,7 +65,8 @@ def __init__(self, args, custom_args=[], platform_util=None): config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json") self.set_kmp_vars(config_file_path, kmp_blocktime=str(self.args.kmp_blocktime)) - set_env_var("OMP_NUM_THREADS", self.args.num_intra_threads) + if not self.args.gpu: + set_env_var("OMP_NUM_THREADS", self.args.num_intra_threads) # If weight-sharing flag is ON, then use the weight-sharing script. if self.args.weight_sharing and not self.args.accuracy_only: @@ -73,9 +74,14 @@ def __init__(self, args, custom_args=[], platform_util=None): self.args.intelai_models, self.args.mode, "eval_image_classifier_inference_weight_sharing.py") else: - benchmark_script = os.path.join( - self.args.intelai_models, self.args.mode, - "eval_image_classifier_inference.py") + if self.args.gpu: + benchmark_script = os.path.join( + self.args.intelai_models, self.args.mode, self.args.precision, + "eval_image_classifier_inference.py") + else: + benchmark_script = os.path.join( + self.args.intelai_models, self.args.mode, + "eval_image_classifier_inference.py") self.benchmark_command = self.get_command_prefix(args.socket_id) + \ self.python_exe + " " + benchmark_script @@ -83,15 +89,24 @@ def __init__(self, args, custom_args=[], platform_util=None): num_cores = self.platform_util.num_cores_per_socket if self.args.num_cores == -1 \ else self.args.num_cores - self.benchmark_command = \ - self.benchmark_command + \ - " --input-graph=" + self.args.input_graph + \ - " --num-inter-threads=" + str(self.args.num_inter_threads) + \ - " --num-intra-threads=" + str(self.args.num_intra_threads) + \ - " --num-cores=" + str(num_cores) + \ - " --batch-size=" + str(self.args.batch_size) + \ - " --warmup-steps=" + str(self.args.warmup_steps) + \ - " --steps=" + str(self.args.steps) + if self.args.gpu: + self.benchmark_command = \ + self.benchmark_command + \ + " --input-graph=" + self.args.input_graph + \ + " --num-cores=" + str(num_cores) + \ + " --batch-size=" + str(self.args.batch_size) + \ + " --warmup-steps=" + str(self.args.warmup_steps) + \ + " --steps=" + str(self.args.steps) + else: + self.benchmark_command = \ + self.benchmark_command + \ + " --input-graph=" + self.args.input_graph + \ + " --num-inter-threads=" + str(self.args.num_inter_threads) + \ + " --num-intra-threads=" + str(self.args.num_intra_threads) + \ + " --num-cores=" + str(num_cores) + \ + " --batch-size=" + str(self.args.batch_size) + \ + " --warmup-steps=" + str(self.args.warmup_steps) + \ + " --steps=" + str(self.args.steps) if self.args.data_num_inter_threads: self.benchmark_command += " --data-num-inter-threads=" + str(self.args.data_num_inter_threads) diff --git a/benchmarks/image_recognition/tensorflow/resnet50v1_5/inference/fp16/__init__.py b/benchmarks/image_recognition/tensorflow/resnet50v1_5/inference/fp16/__init__.py index eb0564d39..f07dbaf88 100644 --- a/benchmarks/image_recognition/tensorflow/resnet50v1_5/inference/fp16/__init__.py +++ b/benchmarks/image_recognition/tensorflow/resnet50v1_5/inference/fp16/__init__.py @@ -15,3 +15,5 @@ # See the License for the specific 
language governing permissions and # limitations under the License. # + +# diff --git a/benchmarks/image_recognition/tensorflow/resnet50v1_5/inference/fp16/model_init.py b/benchmarks/image_recognition/tensorflow/resnet50v1_5/inference/fp16/model_init.py index 9687af6c3..edc52b926 100644 --- a/benchmarks/image_recognition/tensorflow/resnet50v1_5/inference/fp16/model_init.py +++ b/benchmarks/image_recognition/tensorflow/resnet50v1_5/inference/fp16/model_init.py @@ -65,17 +65,22 @@ def __init__(self, args, custom_args=[], platform_util=None): config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json") self.set_kmp_vars(config_file_path, kmp_blocktime=str(self.args.kmp_blocktime)) - set_env_var("OMP_NUM_THREADS", self.args.num_intra_threads) - # If weight-sharing flag is ON, then use the weight-sharing script. if self.args.weight_sharing and not self.args.accuracy_only: + set_env_var("OMP_NUM_THREADS", self.args.num_intra_threads) benchmark_script = os.path.join( self.args.intelai_models, self.args.mode, "eval_image_classifier_inference_weight_sharing.py") else: - benchmark_script = os.path.join( - self.args.intelai_models, self.args.mode, - "eval_image_classifier_inference.py") + if self.args.gpu: + benchmark_script = os.path.join( + self.args.intelai_models, self.args.mode, self.args.precision, + "eval_image_classifier_inference.py") + else: + set_env_var("OMP_NUM_THREADS", self.args.num_intra_threads) + benchmark_script = os.path.join( + self.args.intelai_models, self.args.mode, + "eval_image_classifier_inference.py") self.benchmark_command = self.get_command_prefix(args.socket_id) + \ self.python_exe + " " + benchmark_script @@ -83,16 +88,25 @@ def __init__(self, args, custom_args=[], platform_util=None): num_cores = self.platform_util.num_cores_per_socket if self.args.num_cores == -1 \ else self.args.num_cores - self.benchmark_command = \ - self.benchmark_command + \ - " --input-graph=" + self.args.input_graph + \ - " --data-type=" + self.args.precision + \ - " --num-inter-threads=" + str(self.args.num_inter_threads) + \ - " --num-intra-threads=" + str(self.args.num_intra_threads) + \ - " --num-cores=" + str(num_cores) + \ - " --batch-size=" + str(self.args.batch_size) + \ - " --warmup-steps=" + str(self.args.warmup_steps) + \ - " --steps=" + str(self.args.steps) + if self.args.gpu: + self.benchmark_command = \ + self.benchmark_command + \ + " --input-graph=" + self.args.input_graph + \ + " --num-cores=" + str(num_cores) + \ + " --batch-size=" + str(self.args.batch_size) + \ + " --warmup-steps=" + str(self.args.warmup_steps) + \ + " --steps=" + str(self.args.steps) + else: + self.benchmark_command = \ + self.benchmark_command + \ + " --input-graph=" + self.args.input_graph + \ + " --data-type=" + self.args.precision + \ + " --num-inter-threads=" + str(self.args.num_inter_threads) + \ + " --num-intra-threads=" + str(self.args.num_intra_threads) + \ + " --num-cores=" + str(num_cores) + \ + " --batch-size=" + str(self.args.batch_size) + \ + " --warmup-steps=" + str(self.args.warmup_steps) + \ + " --steps=" + str(self.args.steps) if self.args.data_num_inter_threads: self.benchmark_command += " --data-num-inter-threads=" + str(self.args.data_num_inter_threads) diff --git a/benchmarks/image_recognition/tensorflow/resnet50v1_5/inference/fp32/model_init.py b/benchmarks/image_recognition/tensorflow/resnet50v1_5/inference/fp32/model_init.py index f4e958c75..29f0b58d1 100644 --- a/benchmarks/image_recognition/tensorflow/resnet50v1_5/inference/fp32/model_init.py +++ 
b/benchmarks/image_recognition/tensorflow/resnet50v1_5/inference/fp32/model_init.py @@ -1,7 +1,7 @@ # # -*- coding: utf-8 -*- # -# Copyright (c) 2019 Intel Corporation +# Copyright (c) 2019-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -65,11 +65,15 @@ def __init__(self, args, custom_args=[], platform_util=None): config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json") self.set_kmp_vars(config_file_path, kmp_blocktime=str(self.args.kmp_blocktime)) - set_env_var("OMP_NUM_THREADS", self.args.num_intra_threads) - - benchmark_script = os.path.join( - self.args.intelai_models, self.args.mode, - "eval_image_classifier_inference.py") + if self.args.gpu: + benchmark_script = os.path.join( + self.args.intelai_models, self.args.mode, self.args.precision, + "eval_image_classifier_inference.py") + else: + set_env_var("OMP_NUM_THREADS", self.args.num_intra_threads) + benchmark_script = os.path.join( + self.args.intelai_models, self.args.mode, + "eval_image_classifier_inference.py") self.benchmark_command = self.get_command_prefix(args.socket_id) + \ self.python_exe + " " + benchmark_script @@ -77,15 +81,24 @@ def __init__(self, args, custom_args=[], platform_util=None): num_cores = self.platform_util.num_cores_per_socket if self.args.num_cores == -1 \ else self.args.num_cores - self.benchmark_command = \ - self.benchmark_command + \ - " --input-graph=" + self.args.input_graph + \ - " --num-inter-threads=" + str(self.args.num_inter_threads) + \ - " --num-intra-threads=" + str(self.args.num_intra_threads) + \ - " --num-cores=" + str(num_cores) + \ - " --batch-size=" + str(self.args.batch_size) + \ - " --warmup-steps=" + str(self.args.warmup_steps) + \ - " --steps=" + str(self.args.steps) + if self.args.gpu: + self.benchmark_command = \ + self.benchmark_command + \ + " --input-graph=" + self.args.input_graph + \ + " --num-cores=" + str(num_cores) + \ + " --batch-size=" + str(self.args.batch_size) + \ + " --warmup-steps=" + str(self.args.warmup_steps) + \ + " --steps=" + str(self.args.steps) + else: + self.benchmark_command = \ + self.benchmark_command + \ + " --input-graph=" + self.args.input_graph + \ + " --num-inter-threads=" + str(self.args.num_inter_threads) + \ + " --num-intra-threads=" + str(self.args.num_intra_threads) + \ + " --num-cores=" + str(num_cores) + \ + " --batch-size=" + str(self.args.batch_size) + \ + " --warmup-steps=" + str(self.args.warmup_steps) + \ + " --steps=" + str(self.args.steps) if self.args.data_num_inter_threads: self.benchmark_command += " --data-num-inter-threads=" + str(self.args.data_num_inter_threads) diff --git a/benchmarks/image_recognition/tensorflow/resnet50v1_5/inference/int8/model_init.py b/benchmarks/image_recognition/tensorflow/resnet50v1_5/inference/int8/model_init.py index 03891cffd..250da4cbd 100644 --- a/benchmarks/image_recognition/tensorflow/resnet50v1_5/inference/int8/model_init.py +++ b/benchmarks/image_recognition/tensorflow/resnet50v1_5/inference/int8/model_init.py @@ -1,7 +1,7 @@ # # -*- coding: utf-8 -*- # -# Copyright (c) 2019 Intel Corporation +# Copyright (c) 2019-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
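The resnet50v1_5 inference model_init changes above and below (bfloat16, fp16, fp32, int8) all apply the same pattern: when `--gpu` is set, skip the `OMP_NUM_THREADS` export, resolve a precision-specific copy of `eval_image_classifier_inference.py`, and leave the `--num-inter-threads`/`--num-intra-threads` flags off the command line. A condensed sketch of that shared branching, with hypothetical function and argument names, could look like:

```python
import os

def build_eval_command(args, python_exe="python"):
    """Hypothetical condensation of the GPU/CPU branching added to the
    resnet50v1_5 inference model_init.py files (names are illustrative)."""
    if args.gpu:
        # GPU runs use a precision-specific script and no OpenMP thread pinning.
        script = os.path.join(args.intelai_models, args.mode, args.precision,
                              "eval_image_classifier_inference.py")
        thread_flags = ""
    else:
        os.environ["OMP_NUM_THREADS"] = str(args.num_intra_threads)
        script = os.path.join(args.intelai_models, args.mode,
                              "eval_image_classifier_inference.py")
        thread_flags = (" --num-inter-threads=" + str(args.num_inter_threads) +
                        " --num-intra-threads=" + str(args.num_intra_threads))
    return (python_exe + " " + script +
            " --input-graph=" + args.input_graph +
            thread_flags +
            " --batch-size=" + str(args.batch_size) +
            " --warmup-steps=" + str(args.warmup_steps) +
            " --steps=" + str(args.steps))
```

As far as the diff shows, the design intent is that the GPU backend manages its own threading, so the CPU pinning knobs are omitted rather than translated into GPU-side settings.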
@@ -68,7 +68,8 @@ def parse_args(self): config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json") self.set_kmp_vars(config_file_path, kmp_blocktime=str(self.args.kmp_blocktime)) - set_env_var("OMP_NUM_THREADS", self.args.num_intra_threads) + if not self.args.gpu: + set_env_var("OMP_NUM_THREADS", self.args.num_intra_threads) def run_benchmark_or_accuracy(self): # If weight-sharing flag is ON, then use the weight-sharing script. @@ -77,18 +78,29 @@ def run_benchmark_or_accuracy(self): self.args.intelai_models, self.args.mode, "eval_image_classifier_inference_weight_sharing.py") else: - cmd = os.path.join( - self.args.intelai_models, self.args.mode, - "eval_image_classifier_inference.py") + if self.args.gpu: + cmd = os.path.join( + self.args.intelai_models, self.args.mode, self.args.precision, + "eval_image_classifier_inference.py") + else: + cmd = os.path.join( + self.args.intelai_models, self.args.mode, + "eval_image_classifier_inference.py") cmd = self.get_command_prefix(self.args.socket_id) + self.python_exe + " " + cmd - cmd += " --input-graph=" + self.args.input_graph + \ - " --num-inter-threads=" + str(self.args.num_inter_threads) + \ - " --num-intra-threads=" + str(self.args.num_intra_threads) + \ - " --batch-size=" + str(self.args.batch_size) + \ - " --warmup-steps=" + str(self.args.warmup_steps) + \ - " --steps=" + str(self.args.steps) + if self.args.gpu: + cmd += " --input-graph=" + self.args.input_graph + \ + " --batch-size=" + str(self.args.batch_size) + \ + " --warmup-steps=" + str(self.args.warmup_steps) + \ + " --steps=" + str(self.args.steps) + else: + cmd += " --input-graph=" + self.args.input_graph + \ + " --num-inter-threads=" + str(self.args.num_inter_threads) + \ + " --num-intra-threads=" + str(self.args.num_intra_threads) + \ + " --batch-size=" + str(self.args.batch_size) + \ + " --warmup-steps=" + str(self.args.warmup_steps) + \ + " --steps=" + str(self.args.steps) if self.args.calibrate: cmd += " --calibrate=" + str(self.args.calibrate) diff --git a/benchmarks/image_recognition/tensorflow/resnet50v1_5/training/common_resnet50/resnet50_model_init.py b/benchmarks/image_recognition/tensorflow/resnet50v1_5/training/common_resnet50/resnet50_model_init.py index d16a65bd4..97e08912d 100644 --- a/benchmarks/image_recognition/tensorflow/resnet50v1_5/training/common_resnet50/resnet50_model_init.py +++ b/benchmarks/image_recognition/tensorflow/resnet50v1_5/training/common_resnet50/resnet50_model_init.py @@ -66,7 +66,8 @@ def __init__(self, args, custom_args=[], platform_util=None): config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json") self.set_kmp_vars(config_file_path, kmp_blocktime=str(self.args.kmp_blocktime)) - set_env_var("OMP_NUM_THREADS", self.args.num_intra_threads) + if not self.args.gpu: + set_env_var("OMP_NUM_THREADS", self.args.num_intra_threads) benchmark_script = os.path.join( self.args.intelai_models, self.args.mode, @@ -84,17 +85,28 @@ def __init__(self, args, custom_args=[], platform_util=None): self.python_exe + " " + benchmark_script # Model requires random_seed. Just setting it to a random value. 
- random_seed = 2 - self.benchmark_command = \ - self.benchmark_command + \ - " " + str(random_seed) + \ - " --batch_size=" + str(self.args.batch_size) + \ - " --max_train_steps=" + str(self.args.steps) + \ - " --train_epochs=" + str(self.args.trainepochs) + \ - " --epochs_between_evals=" + str(self.args.epochsbtwevals) + \ - " --inter_op_parallelism_threads " + str(self.args.num_inter_threads) + \ - " --intra_op_parallelism_threads " + str(self.args.num_intra_threads) + \ - " --version 1 --resnet_size 50 --data_format=channels_last" + if self.args.gpu: + random_seed = 1 + self.benchmark_command = \ + self.benchmark_command + \ + " " + str(random_seed) + \ + " --batch_size=" + str(self.args.batch_size) + \ + " --max_train_steps=" + str(self.args.steps) + \ + " --train_epochs=" + str(self.args.trainepochs) + \ + " --epochs_between_evals=" + str(self.args.epochsbtwevals) + \ + " --num_gpus 1 --stop_threshold 0.75 --version 1 --resnet_size 50" + else: + random_seed = 2 + self.benchmark_command = \ + self.benchmark_command + \ + " " + str(random_seed) + \ + " --batch_size=" + str(self.args.batch_size) + \ + " --max_train_steps=" + str(self.args.steps) + \ + " --train_epochs=" + str(self.args.trainepochs) + \ + " --epochs_between_evals=" + str(self.args.epochsbtwevals) + \ + " --inter_op_parallelism_threads " + str(self.args.num_inter_threads) + \ + " --intra_op_parallelism_threads " + str(self.args.num_intra_threads) + \ + " --version 1 --resnet_size 50 --data_format=channels_last" # if the data location and checkpoint directory is not empty, then include the arg if self.args.data_location and os.listdir(self.args.data_location): diff --git a/benchmarks/language_modeling/tensorflow/bert_large/inference/fp16/model_init.py b/benchmarks/language_modeling/tensorflow/bert_large/inference/fp16/model_init.py index 555e782e5..7b2b0fa58 100644 --- a/benchmarks/language_modeling/tensorflow/bert_large/inference/fp16/model_init.py +++ b/benchmarks/language_modeling/tensorflow/bert_large/inference/fp16/model_init.py @@ -101,33 +101,48 @@ def __init__(self, args, custom_args=[], platform_util=None): if self.args.predict_file and not os.path.isabs(self.args.predict_file): self.args.predict_file = os.path.join(self.args.data_location, self.args.predict_file) - if self.args.init_checkpoint and not os.path.isabs(self.args.init_checkpoint): - self.args.init_checkpoint = os.path.join(self.args.checkpoint, self.args.init_checkpoint) + if not self.args.gpu: + if self.args.init_checkpoint and not os.path.isabs(self.args.init_checkpoint): + self.args.init_checkpoint = os.path.join(self.args.checkpoint, self.args.init_checkpoint) - # set default inter/intra threads - self.set_num_inter_intra_threads() + # set default inter/intra threads + self.set_num_inter_intra_threads() - if not os.getenv("OMP_NUM_THREADS"): - if self.args.num_intra_threads: - set_env_var("OMP_NUM_THREADS", self.args.num_intra_threads) - else: - set_env_var("OMP_NUM_THREADS", platform_util.num_cores_per_socket) + if not os.getenv("OMP_NUM_THREADS"): + if self.args.num_intra_threads: + set_env_var("OMP_NUM_THREADS", self.args.num_intra_threads) + else: + set_env_var("OMP_NUM_THREADS", platform_util.num_cores_per_socket) model_script = os.path.join( - self.args.intelai_models, self.args.mode, - "run_squad.py") - - model_args = " --init_checkpoint=" + str(self.args.init_checkpoint) + \ - " --vocab_file=" + str(self.args.vocab_file) + \ - " --bert_config_file=" + str(self.args.bert_config_file) + \ - " --predict_file=" + str(self.args.predict_file) + \ 
- " --precision=" + str(self.args.precision) + \ - " --output_dir=" + str(self.args.output_dir) + \ - " --predict_batch_size=" + str(self.args.batch_size) + \ - " --experimental_gelu=" + str(self.args.experimental_gelu) + \ - " --optimized_softmax=" + str(self.args.optimized_softmax) + \ - " --amp=" + str(self.args.amp) + \ - " --do_predict=True " + self.args.intelai_models, self.args.mode, "run_squad.py") + + if self.args.gpu: + if self.args.num_inter_threads: + set_env_var("TF_NUM_INTEROP_THREADS", self.args.num_inter_threads) + else: + set_env_var("TF_NUM_INTEROP_THREADS", 1) + model_args = " --vocab_file=" + str(self.args.vocab_file) + \ + " --bert_config_file=" + str(self.args.bert_config_file) + \ + " --predict_file=" + str(self.args.predict_file) + \ + " --precision=" + str(self.args.precision) + \ + " --output_dir=" + str(self.args.output_dir) + \ + " --predict_batch_size=" + str(self.args.batch_size) + \ + " --experimental_gelu=" + str(self.args.experimental_gelu) + \ + " --optimized_softmax=" + str(self.args.optimized_softmax) + \ + " --do_predict=True " + else: + model_args = " --init_checkpoint=" + str(self.args.init_checkpoint) + \ + " --vocab_file=" + str(self.args.vocab_file) + \ + " --bert_config_file=" + str(self.args.bert_config_file) + \ + " --predict_file=" + str(self.args.predict_file) + \ + " --precision=" + str(self.args.precision) + \ + " --output_dir=" + str(self.args.output_dir) + \ + " --predict_batch_size=" + str(self.args.batch_size) + \ + " --experimental_gelu=" + str(self.args.experimental_gelu) + \ + " --optimized_softmax=" + str(self.args.optimized_softmax) + \ + " --amp=" + str(self.args.amp) + \ + " --do_predict=True " if self.args.input_graph: model_args += " --input_graph=" + str(self.args.input_graph) @@ -153,16 +168,16 @@ def __init__(self, args, custom_args=[], platform_util=None): if self.args.num_intra_threads: model_args += " --intra_op_parallelism_threads=" + str(self.args.num_intra_threads) - if self.args.warmup_steps: - model_args += " --warmup_steps=" + str(self.args.warmup_steps) - - if self.args.steps: - model_args += " --steps=" + str(self.args.steps) - if self.args.weight_sharing: model_args += " --weight_sharing" - model_args += " --num_cores_per_socket=" + str(platform_util.num_cores_per_socket) + if not self.args.gpu: + model_args += " --num_cores_per_socket=" + str(platform_util.num_cores_per_socket) + if self.args.warmup_steps: + model_args += " --warmup_steps=" + str(self.args.warmup_steps) + + if self.args.steps: + model_args += " --steps=" + str(self.args.steps) self.benchmark_command = self.get_command_prefix(args.socket_id) + \ self.python_exe + " " + model_script + model_args diff --git a/benchmarks/language_modeling/tensorflow/bert_large/inference/fp32/model_init.py b/benchmarks/language_modeling/tensorflow/bert_large/inference/fp32/model_init.py index 75bcac089..40caa2cd1 100644 --- a/benchmarks/language_modeling/tensorflow/bert_large/inference/fp32/model_init.py +++ b/benchmarks/language_modeling/tensorflow/bert_large/inference/fp32/model_init.py @@ -106,8 +106,9 @@ def __init__(self, args, custom_args=[], platform_util=None): if self.args.predict_file and not os.path.isabs(self.args.predict_file): self.args.predict_file = os.path.join(self.args.data_location, self.args.predict_file) - if self.args.init_checkpoint and not os.path.isabs(self.args.init_checkpoint): - self.args.init_checkpoint = os.path.join(self.args.checkpoint, self.args.init_checkpoint) + if not self.args.gpu: + if self.args.init_checkpoint and not 
os.path.isabs(self.args.init_checkpoint): + self.args.init_checkpoint = os.path.join(self.args.checkpoint, self.args.init_checkpoint) # set default inter/intra threads self.set_num_inter_intra_threads() @@ -121,17 +122,29 @@ def __init__(self, args, custom_args=[], platform_util=None): model_script = os.path.join( self.args.intelai_models, self.args.mode, "run_squad.py") - model_args = " --init_checkpoint=" + str(self.args.init_checkpoint) + \ - " --vocab_file=" + str(self.args.vocab_file) + \ - " --bert_config_file=" + str(self.args.bert_config_file) + \ - " --predict_file=" + str(self.args.predict_file) + \ - " --precision=" + str(self.args.precision) + \ - " --output_dir=" + str(self.args.output_dir) + \ - " --predict_batch_size=" + str(self.args.batch_size) + \ - " --experimental_gelu=" + str(self.args.experimental_gelu) + \ - " --optimized_softmax=" + str(self.args.optimized_softmax) + \ - " --input_graph=" + str(self.args.input_graph) + \ - " --do_predict=True " + if self.args.gpu: + model_args = " --vocab_file=" + str(self.args.vocab_file) + \ + " --bert_config_file=" + str(self.args.bert_config_file) + \ + " --predict_file=" + str(self.args.predict_file) + \ + " --precision=" + str(self.args.precision) + \ + " --output_dir=" + str(self.args.output_dir) + \ + " --predict_batch_size=" + str(self.args.batch_size) + \ + " --experimental_gelu=" + str(self.args.experimental_gelu) + \ + " --optimized_softmax=" + str(self.args.optimized_softmax) + \ + " --input_graph=" + str(self.args.input_graph) + \ + " --do_predict=True " + else: + model_args = " --init_checkpoint=" + str(self.args.init_checkpoint) + \ + " --vocab_file=" + str(self.args.vocab_file) + \ + " --bert_config_file=" + str(self.args.bert_config_file) + \ + " --predict_file=" + str(self.args.predict_file) + \ + " --precision=" + str(self.args.precision) + \ + " --output_dir=" + str(self.args.output_dir) + \ + " --predict_batch_size=" + str(self.args.batch_size) + \ + " --experimental_gelu=" + str(self.args.experimental_gelu) + \ + " --optimized_softmax=" + str(self.args.optimized_softmax) + \ + " --input_graph=" + str(self.args.input_graph) + \ + " --do_predict=True " if self.args.accuracy_only: model_args += " --mode=accuracy" @@ -160,11 +173,12 @@ def __init__(self, args, custom_args=[], platform_util=None): if self.args.steps: model_args += " --steps=" + str(self.args.steps) - if self.args.weight_sharing: - model_args += " --weight_sharing" - model_args += " --num_cores_per_socket=" + str(platform_util.num_cores_per_socket) + if not self.args.gpu: + if self.args.weight_sharing: + model_args += " --weight_sharing" + self.benchmark_command = self.get_command_prefix(args.socket_id) + \ self.python_exe + " " + model_script + model_args diff --git a/benchmarks/language_modeling/tensorflow/bert_large/training/__init__.py b/benchmarks/language_modeling/tensorflow/bert_large/training/__init__.py new file mode 100644 index 000000000..e2a8ccd87 --- /dev/null +++ b/benchmarks/language_modeling/tensorflow/bert_large/training/__init__.py @@ -0,0 +1,19 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: EPL-2.0 +# diff --git a/benchmarks/language_modeling/tensorflow/bert_large/training/bfloat16/model_init.py b/benchmarks/language_modeling/tensorflow/bert_large/training/bfloat16/model_init.py index d71beb3f5..a478423a9 100644 --- a/benchmarks/language_modeling/tensorflow/bert_large/training/bfloat16/model_init.py +++ b/benchmarks/language_modeling/tensorflow/bert_large/training/bfloat16/model_init.py @@ -1,7 +1,7 @@ # # -*- coding: utf-8 -*- # -# Copyright (c) 2018 Intel Corporation +# Copyright (c) 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -108,7 +108,8 @@ def __init__(self, args, custom_args=[], platform_util=None): config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json") self.set_kmp_vars(config_file_path) - set_env_var("OMP_NUM_THREADS", self.args.num_intra_threads) + if not self.args.gpu: + set_env_var("OMP_NUM_THREADS", self.args.num_intra_threads) run_script = "run_squad.py" if self.args.train_option == "Pretraining": @@ -196,7 +197,7 @@ def __init__(self, args, custom_args=[], platform_util=None): if os.environ["MPI_NUM_PROCESSES"] == "None": self.benchmark_command = self.benchmark_command + self.python_exe + " " + benchmark_script + "\n" - else: + elif not self.args.gpu: numa_cmd = " -np 1 numactl -N {} -m {} " self.benchmark_command = self.benchmark_command + numa_cmd.format(0, 0) + os.environ["PYTHON_EXE"] + " " \ + benchmark_script @@ -215,7 +216,7 @@ def __init__(self, args, custom_args=[], platform_util=None): def run(self): if self.benchmark_command: print("----------------------------Run command-------------------------------------") - print(self.benchmark_command) + print(self.benchmark_command, flush=True) print("------------------------------------------------------------------------") self.run_command(self.benchmark_command) if self.args.output_results: diff --git a/benchmarks/language_modeling/tensorflow/bert_large/training/fp32/model_init.py b/benchmarks/language_modeling/tensorflow/bert_large/training/fp32/model_init.py index 1fb4952db..9f1f8d521 100644 --- a/benchmarks/language_modeling/tensorflow/bert_large/training/fp32/model_init.py +++ b/benchmarks/language_modeling/tensorflow/bert_large/training/fp32/model_init.py @@ -1,7 +1,7 @@ # # -*- coding: utf-8 -*- # -# Copyright (c) 2018 Intel Corporation +# Copyright (c) 2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
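Both bert_large training model_init changes (bfloat16 above, fp32 below) gate the launch setup on `--gpu` in the same two places: `OMP_NUM_THREADS` is only exported for CPU runs, and the `numactl` prefix for MPI runs is only appended when the run is not on GPU. A rough standalone sketch of that control flow (function and variable names are illustrative, not the repo's exact code):

```python
import os

def build_training_launch(benchmark_script, args, python_exe="python"):
    """Rough sketch of the CPU-vs-GPU launch gating added to the bert_large
    training model_init.py files (illustrative, not the repo's exact code)."""
    if not args.gpu:
        os.environ["OMP_NUM_THREADS"] = str(args.num_intra_threads)

    command = ""
    if os.environ.get("MPI_NUM_PROCESSES", "None") == "None":
        # Single-process run: just invoke the training script.
        command += python_exe + " " + benchmark_script + "\n"
    elif not args.gpu:
        # Multi-process CPU run: pin each rank with numactl, as in the diff.
        command += " -np 1 numactl -N 0 -m 0 " + python_exe + " " + benchmark_script
    # For GPU runs with MPI, no numactl/memory-binding prefix is added.
    return command
```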
@@ -94,6 +94,8 @@ def __init__(self, args, custom_args=[], platform_util=None): arg_parser.add_argument('--num-intra-threads', help=' Number of Intra ops threads', type=int, dest="num_intra_threads", default=self.args.num_inter_threads) arg_parser.add_argument('--profile', help=' Enable Tensorflow profiler hook', dest="profile", default="False") + arg_parser.add_argument('--optimized_softmax', help='optimized_softmax', + dest="optimized_softmax", default="False") arg_parser.add_argument('--experimental-gelu', help=' [Experimental] Use experimental gelu op.', dest="experimental_gelu", default="False") arg_parser.add_argument('--mpi_workers_sync_gradients', @@ -131,10 +133,14 @@ def __init__(self, args, custom_args=[], platform_util=None): # num_cores = self.platform_util.num_cores_per_socket if self.args.num_cores == -1 else self.args.num_cores - # data_location =str(self.args.data_location) + # data_location = str(self.args.data_location) # bert_large_data = data_location + "/wwm_cased_L-24_H-1024_A-16/" # bert_squad_data = data_location + "/SQuAD/" - # bert_glue_dir = data_location + "glue/glue_data" + # bert_glue_dir = data_location + "glue/glue_data" + + if self.args.gpu and self.args.train_option == "Pretraining": + self.args.num_intra_threads = 1 + self.args.num_inter_threads = 1 eoo = " \\\n" self.cmd_args = \ @@ -165,6 +171,10 @@ def __init__(self, args, custom_args=[], platform_util=None): " --doc_stride=" + str(self.args.doc_stride) if self.args.train_option == "Pretraining": + # CreateMultipleSubDevices is set by default for GPU, it causes + # memory issues, hence removing it for Pretraining. + if 'CreateMultipleSubDevices' in os.environ: + del os.environ['CreateMultipleSubDevices'] if self.args.init_checkpoint != '': self.cmd_args = self.cmd_args + \ " --init_checkpoint=" + str(self.args.init_checkpoint) + eoo @@ -172,6 +182,7 @@ def __init__(self, args, custom_args=[], platform_util=None): " --input_file=" + str(self.args.input_file) + eoo + \ " --do_eval=" + str(self.args.do_eval) + eoo + \ " --num_train_steps=" + str(self.args.num_train_steps) + eoo + \ + " --optimized_softmax=" + str(self.args.optimized_softmax) + eoo + \ " --num_warmup_steps=" + str(self.args.warmup_steps) + eoo + \ " --max_predictions_per_seq=" + str(self.args.max_predictions) @@ -193,7 +204,7 @@ def __init__(self, args, custom_args=[], platform_util=None): if os.environ["MPI_NUM_PROCESSES"] == "None": self.benchmark_command = self.benchmark_command + self.python_exe + " " + benchmark_script + "\n" - else: + elif not self.args.gpu: numa_cmd = " -np 1 numactl -N {} -m {} " self.benchmark_command = self.benchmark_command + numa_cmd.format(0, 0) + os.environ["PYTHON_EXE"] + " " \ + benchmark_script @@ -212,7 +223,7 @@ def __init__(self, args, custom_args=[], platform_util=None): def run(self): if self.benchmark_command: print("----------------------------Run command-------------------------------------") - print(self.benchmark_command) + print(self.benchmark_command, flush=True) print("------------------------------------------------------------------------") self.run_command(self.benchmark_command) if self.args.output_results: diff --git a/benchmarks/launch_benchmark.py b/benchmarks/launch_benchmark.py index 1454ccc06..405d453a8 100644 --- a/benchmarks/launch_benchmark.py +++ b/benchmarks/launch_benchmark.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- # -# Copyright (c) 2018 Intel Corporation +# Copyright (c) 2018-2021 Intel Corporation # # Licensed under the Apache License, Version 2.0 
(the "License"); # you may not use this file except in compliance with the License. @@ -241,7 +241,8 @@ def get_env_vars(self, benchmark_scripts, use_case, intelai_models, "USE_CASE": str(use_case), "VERBOSE": args.verbose, "WEIGHT_SHARING": args.weight_sharing, - "SYNTHETIC_DATA": args.synthetic_data + "SYNTHETIC_DATA": args.synthetic_data, + "GPU": str(args.gpu) } # Add custom model args as env vars) diff --git a/benchmarks/object_detection/tensorflow/ssd-mobilenet/inference/int8/model_init.py b/benchmarks/object_detection/tensorflow/ssd-mobilenet/inference/int8/model_init.py index 71baae975..45074e540 100644 --- a/benchmarks/object_detection/tensorflow/ssd-mobilenet/inference/int8/model_init.py +++ b/benchmarks/object_detection/tensorflow/ssd-mobilenet/inference/int8/model_init.py @@ -58,6 +58,7 @@ def __init__(self, args, custom_args=[], platform_util=None): else: # Did not support multi-batch accuracy check. self.command_prefix += " -b {0}".format(self.args.batch_size) + self.command_prefix += " --benchmark" def run(self): self.run_command(self.command_prefix) diff --git a/dockerfiles/gpu_model_containers/pytorch-atsm-resnet50v1-5-inference.Dockerfile b/dockerfiles/gpu_model_containers/pytorch-atsm-resnet50v1-5-inference.Dockerfile new file mode 100644 index 000000000..d5f51ca45 --- /dev/null +++ b/dockerfiles/gpu_model_containers/pytorch-atsm-resnet50v1-5-inference.Dockerfile @@ -0,0 +1,71 @@ +# Copyright (c) 2020-2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# +# THIS IS A GENERATED DOCKERFILE. +# +# This file was assembled from multiple pieces, whose use is documented +# throughout. Please refer to the TensorFlow dockerfiles documentation +# for more information. + +ARG PYTORCH_BASE_IMAGE="intel/intel-extension-for-pytorch" +ARG PYTORCH_BASE_TAG="gpu" + +FROM ${PYTORCH_BASE_IMAGE}:${PYTORCH_BASE_TAG} + +ARG PACKAGE_DIR=model_packages + +ARG PACKAGE_NAME="pytorch-atsm-resnet50v1-5-inference" + +ARG MODEL_WORKSPACE + +# ${MODEL_WORKSPACE} and below needs to be owned by root:root rather than the current UID:GID +# this allows the default user (root) to work in k8s single-node, multi-node +RUN umask 002 && mkdir -p ${MODEL_WORKSPACE} && chgrp root ${MODEL_WORKSPACE} && chmod g+s+w,o+s+r ${MODEL_WORKSPACE} + +ADD --chown=0:0 ${PACKAGE_DIR}/${PACKAGE_NAME}.tar.gz ${MODEL_WORKSPACE} + +RUN chown -R root ${MODEL_WORKSPACE}/${PACKAGE_NAME} && chgrp -R root ${MODEL_WORKSPACE}/${PACKAGE_NAME} && chmod -R g+s+w ${MODEL_WORKSPACE}/${PACKAGE_NAME} && find ${MODEL_WORKSPACE}/${PACKAGE_NAME} -type d | xargs chmod o+r+x + +WORKDIR ${MODEL_WORKSPACE}/${PACKAGE_NAME} + +ENV USER_ID=0 + +ENV USER_NAME=root + +ENV GROUP_ID=0 + +ENV GROUP_NAME=root + +RUN apt-get update && \ + apt-get install --no-install-recommends --fix-missing -y gosu + +RUN echo '#!/bin/bash\n\ +[ -f /opt/intel/oneapi/setvars.sh ] && . 
/opt/intel/oneapi/setvars.sh --config=$HOME/cfg.txt\n\ USER_ID=$USER_ID\n\ USER_NAME=$USER_NAME\n\ GROUP_ID=$GROUP_ID\n\ GROUP_NAME=$GROUP_NAME\n\ if [[ $GROUP_NAME != root ]]; then\n\ groupadd -r -g $GROUP_ID $GROUP_NAME\n\ fi\n\ if [[ $USER_NAME != root ]]; then\n\ useradd --no-log-init -r -u $USER_ID -g $GROUP_NAME -s /bin/bash -M $USER_NAME\n\ fi\n\ exec /usr/sbin/gosu $USER_NAME:$GROUP_NAME "$@"\n '\ >> /tmp/entrypoint.sh + +RUN chmod u+x,g+x /tmp/entrypoint.sh + +ENTRYPOINT ["/tmp/entrypoint.sh"] diff --git a/dockerfiles/gpu_model_containers/pytorch-atsm-ssd-mobilenet-inference.Dockerfile b/dockerfiles/gpu_model_containers/pytorch-atsm-ssd-mobilenet-inference.Dockerfile new file mode 100644 index 000000000..15fa56f89 --- /dev/null +++ b/dockerfiles/gpu_model_containers/pytorch-atsm-ssd-mobilenet-inference.Dockerfile @@ -0,0 +1,94 @@ +# Copyright (c) 2020-2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# +# THIS IS A GENERATED DOCKERFILE. +# +# This file was assembled from multiple pieces, whose use is documented +# throughout. Please refer to the TensorFlow dockerfiles documentation +# for more information. + +ARG PYTORCH_BASE_IMAGE="intel/intel-extension-for-pytorch" +ARG PYTORCH_BASE_TAG="gpu" + +FROM ${PYTORCH_BASE_IMAGE}:${PYTORCH_BASE_TAG} + +RUN apt-get update && \ + apt-get install -y --no-install-recommends --fix-missing numactl + +ARG PY_VERSION=3.9 + +RUN apt-get update && \ + apt-get install -y --no-install-recommends --fix-missing \ + build-essential \ + python${PY_VERSION}-dev + +RUN pip install opencv-python + +# Note pycocotools has to be installed after the other requirements +RUN pip install \ + Cython \ + contextlib2 \ + jupyter \ + lxml \ + matplotlib \ + 'numpy>=1.17.4' \ + 'pillow>=9.3.0' && \ + pip install pycocotools + +ARG PACKAGE_DIR=model_packages + +ARG PACKAGE_NAME="pytorch-atsm-ssd-mobilenet-inference" + +ARG MODEL_WORKSPACE + +# ${MODEL_WORKSPACE} and below needs to be owned by root:root rather than the current UID:GID +# this allows the default user (root) to work in k8s single-node, multi-node +RUN umask 002 && mkdir -p ${MODEL_WORKSPACE} && chgrp root ${MODEL_WORKSPACE} && chmod g+s+w,o+s+r ${MODEL_WORKSPACE} + +ADD --chown=0:0 ${PACKAGE_DIR}/${PACKAGE_NAME}.tar.gz ${MODEL_WORKSPACE} + +RUN chown -R root ${MODEL_WORKSPACE}/${PACKAGE_NAME} && chgrp -R root ${MODEL_WORKSPACE}/${PACKAGE_NAME} && chmod -R g+s+w ${MODEL_WORKSPACE}/${PACKAGE_NAME} && find ${MODEL_WORKSPACE}/${PACKAGE_NAME} -type d | xargs chmod o+r+x + +WORKDIR ${MODEL_WORKSPACE}/${PACKAGE_NAME} + +ENV USER_ID=0 + +ENV USER_NAME=root + +ENV GROUP_ID=0 + +ENV GROUP_NAME=root + +RUN apt-get update && \ + apt-get install --no-install-recommends --fix-missing -y gosu + +RUN echo '#!/bin/bash\n\ +[ -f /opt/intel/oneapi/setvars.sh ] && . 
/opt/intel/oneapi/setvars.sh --config=$HOME/cfg.txt\n\ USER_ID=$USER_ID\n\ USER_NAME=$USER_NAME\n\ GROUP_ID=$GROUP_ID\n\ GROUP_NAME=$GROUP_NAME\n\ if [[ $GROUP_NAME != root ]]; then\n\ groupadd -r -g $GROUP_ID $GROUP_NAME\n\ fi\n\ if [[ $USER_NAME != root ]]; then\n\ useradd --no-log-init -r -u $USER_ID -g $GROUP_NAME -s /bin/bash -M $USER_NAME\n\ fi\n\ exec /usr/sbin/gosu $USER_NAME:$GROUP_NAME "$@"\n '\ >> /tmp/entrypoint.sh + +RUN chmod u+x,g+x /tmp/entrypoint.sh + +ENTRYPOINT ["/tmp/entrypoint.sh"] diff --git a/dockerfiles/gpu_model_containers/pytorch-atsm-yolov4-inference.Dockerfile b/dockerfiles/gpu_model_containers/pytorch-atsm-yolov4-inference.Dockerfile new file mode 100644 index 000000000..9512b7a2c --- /dev/null +++ b/dockerfiles/gpu_model_containers/pytorch-atsm-yolov4-inference.Dockerfile @@ -0,0 +1,94 @@ +# Copyright (c) 2020-2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# +# THIS IS A GENERATED DOCKERFILE. +# +# This file was assembled from multiple pieces, whose use is documented +# throughout. Please refer to the TensorFlow dockerfiles documentation +# for more information. + +ARG PYTORCH_BASE_IMAGE="intel/intel-extension-for-pytorch" +ARG PYTORCH_BASE_TAG="gpu" + +FROM ${PYTORCH_BASE_IMAGE}:${PYTORCH_BASE_TAG} + +RUN apt-get update && \ + apt-get install -y --no-install-recommends --fix-missing numactl + +ARG PY_VERSION=3.9 + +RUN apt-get update && \ + apt-get install -y --no-install-recommends --fix-missing \ + build-essential \ + python${PY_VERSION}-dev + +RUN pip install opencv-python + +# Note pycocotools has to be installed after the other requirements +RUN pip install \ + Cython \ + contextlib2 \ + jupyter \ + lxml \ + matplotlib \ + 'numpy>=1.17.4' \ + 'pillow>=9.3.0' && \ + pip install pycocotools + +ARG PACKAGE_DIR=model_packages + +ARG PACKAGE_NAME="pytorch-atsm-yolov4-inference" + +ARG MODEL_WORKSPACE + +# ${MODEL_WORKSPACE} and below needs to be owned by root:root rather than the current UID:GID +# this allows the default user (root) to work in k8s single-node, multi-node +RUN umask 002 && mkdir -p ${MODEL_WORKSPACE} && chgrp root ${MODEL_WORKSPACE} && chmod g+s+w,o+s+r ${MODEL_WORKSPACE} + +ADD --chown=0:0 ${PACKAGE_DIR}/${PACKAGE_NAME}.tar.gz ${MODEL_WORKSPACE} + +RUN chown -R root ${MODEL_WORKSPACE}/${PACKAGE_NAME} && chgrp -R root ${MODEL_WORKSPACE}/${PACKAGE_NAME} && chmod -R g+s+w ${MODEL_WORKSPACE}/${PACKAGE_NAME} && find ${MODEL_WORKSPACE}/${PACKAGE_NAME} -type d | xargs chmod o+r+x + +WORKDIR ${MODEL_WORKSPACE}/${PACKAGE_NAME} + +ENV USER_ID=0 + +ENV USER_NAME=root + +ENV GROUP_ID=0 + +ENV GROUP_NAME=root + +RUN apt-get update && \ + apt-get install --no-install-recommends --fix-missing -y gosu + +RUN echo '#!/bin/bash\n\ +[ -f /opt/intel/oneapi/setvars.sh ] && . 
/opt/intel/oneapi/setvars.sh --config=$HOME/cfg.txt\n\ +USER_ID=$USER_ID\n\ +USER_NAME=$USER_NAME\n\ +GROUP_ID=$GROUP_ID\n\ +GROUP_NAME=$GROUP_NAME\n\ +if [[ $GROUP_NAME != root ]]; then\n\ + groupadd -r -g $GROUP_ID $GROUP_NAME\n\ +fi\n\ +if [[ $USER_NAME != root ]]; then\n\ + useradd --no-log-init -r -u $USER_ID -g $GROUP_NAME -s /bin/bash -M $USER_NAME\n\ +fi\n\ +exec /usr/sbin/gosu $USER_NAME:$GROUP_NAME "$@"\n '\ +>> /tmp/entrypoint.sh + +RUN chmod u+x,g+x /tmp/entrypoint.sh + +ENTRYPOINT ["/tmp/entrypoint.sh"] diff --git a/dockerfiles/gpu_model_containers/pytorch-max-series-bert-large-inference.Dockerfile b/dockerfiles/gpu_model_containers/pytorch-max-series-bert-large-inference.Dockerfile new file mode 100644 index 000000000..aaa575aaf --- /dev/null +++ b/dockerfiles/gpu_model_containers/pytorch-max-series-bert-large-inference.Dockerfile @@ -0,0 +1,80 @@ +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# +# THIS IS A GENERATED DOCKERFILE. +# +# This file was assembled from multiple pieces, whose use is documented +# throughout. Please refer to the TensorFlow dockerfiles documentation +# for more information. + +ARG PYTORCH_BASE_IMAGE="intel/intel-extension-for-pytorch" +ARG PYTORCH_BASE_TAG="xpu-max" + +FROM ${PYTORCH_BASE_IMAGE}:${PYTORCH_BASE_TAG} + + +ARG PACKAGE_DIR=model_packages + +ARG PACKAGE_NAME="pytorch-max-series-bert-large-inference" + +ARG MODEL_WORKSPACE + +# ${MODEL_WORKSPACE} and below needs to be owned by root:root rather than the current UID:GID +# this allows the default user (root) to work in k8s single-node, multi-node +RUN umask 002 && mkdir -p ${MODEL_WORKSPACE} && chgrp root ${MODEL_WORKSPACE} && chmod g+s+w,o+s+r ${MODEL_WORKSPACE} + +ADD --chown=0:0 ${PACKAGE_DIR}/${PACKAGE_NAME}.tar.gz ${MODEL_WORKSPACE} + +RUN chown -R root ${MODEL_WORKSPACE}/${PACKAGE_NAME} && chgrp -R root ${MODEL_WORKSPACE}/${PACKAGE_NAME} && chmod -R g+s+w ${MODEL_WORKSPACE}/${PACKAGE_NAME} && find ${MODEL_WORKSPACE}/${PACKAGE_NAME} -type d | xargs chmod o+r+x + +WORKDIR ${MODEL_WORKSPACE}/${PACKAGE_NAME} + +ARG PACKAGE_NAME="pytorch-max-series-bert-large-inference" +ARG MODEL_WORKSPACE + +RUN cd ${MODEL_WORKSPACE}/${PACKAGE_NAME}/models/language_modeling/pytorch/bert_large/inference/gpu && \ + pip install -r requirements.txt + +RUN cd - + +ENV USER_ID=0 + +ENV USER_NAME=root + +ENV GROUP_ID=0 + +ENV GROUP_NAME=root + +RUN apt-get update && \ + apt-get install --no-install-recommends --fix-missing -y gosu + +RUN echo '#!/bin/bash\n\ +[ -f /opt/intel/oneapi/setvars.sh ] && . 
/opt/intel/oneapi/setvars.sh\n\ +USER_ID=$USER_ID\n\ +USER_NAME=$USER_NAME\n\ +GROUP_ID=$GROUP_ID\n\ +GROUP_NAME=$GROUP_NAME\n\ +if [[ $GROUP_NAME != root ]]; then\n\ + groupadd -r -g $GROUP_ID $GROUP_NAME\n\ +fi\n\ +if [[ $USER_NAME != root ]]; then\n\ + useradd --no-log-init -r -u $USER_ID -g $GROUP_NAME -s /bin/bash -M $USER_NAME\n\ +fi\n\ +exec /usr/sbin/gosu $USER_NAME:$GROUP_NAME "$@"\n '\ +>> /tmp/entrypoint.sh + +RUN chmod u+x,g+x /tmp/entrypoint.sh + +ENTRYPOINT ["/tmp/entrypoint.sh"] diff --git a/dockerfiles/gpu_model_containers/pytorch-max-series-bert-large-training.Dockerfile b/dockerfiles/gpu_model_containers/pytorch-max-series-bert-large-training.Dockerfile new file mode 100644 index 000000000..5ae315bcb --- /dev/null +++ b/dockerfiles/gpu_model_containers/pytorch-max-series-bert-large-training.Dockerfile @@ -0,0 +1,94 @@ +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# +# THIS IS A GENERATED DOCKERFILE. +# +# This file was assembled from multiple pieces, whose use is documented +# throughout. Please refer to the TensorFlow dockerfiles documentation +# for more information. + +ARG PYTORCH_BASE_IMAGE="intel/intel-extension-for-pytorch" +ARG PYTORCH_BASE_TAG="xpu-max" + +FROM ${PYTORCH_BASE_IMAGE}:${PYTORCH_BASE_TAG} + + + +RUN curl -fsSL https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2023.PUB | apt-key add - +RUN echo "deb [trusted=yes] https://apt.repos.intel.com/oneapi all main " > /etc/apt/sources.list.d/oneAPI.list + +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + ca-certificates \ + intel-oneapi-mpi-devel \ + intel-oneapi-ccl \ + && \ + rm -rf /var/lib/apt/lists/* + +ARG PACKAGE_DIR=model_packages + +ARG PACKAGE_NAME="pytorch-max-series-bert-large-training" + +ARG MODEL_WORKSPACE + +# ${MODEL_WORKSPACE} and below needs to be owned by root:root rather than the current UID:GID +# this allows the default user (root) to work in k8s single-node, multi-node +RUN umask 002 && mkdir -p ${MODEL_WORKSPACE} && chgrp root ${MODEL_WORKSPACE} && chmod g+s+w,o+s+r ${MODEL_WORKSPACE} + +ADD --chown=0:0 ${PACKAGE_DIR}/${PACKAGE_NAME}.tar.gz ${MODEL_WORKSPACE} + +RUN chown -R root ${MODEL_WORKSPACE}/${PACKAGE_NAME} && chgrp -R root ${MODEL_WORKSPACE}/${PACKAGE_NAME} && chmod -R g+s+w ${MODEL_WORKSPACE}/${PACKAGE_NAME} && find ${MODEL_WORKSPACE}/${PACKAGE_NAME} -type d | xargs chmod o+r+x + +WORKDIR ${MODEL_WORKSPACE}/${PACKAGE_NAME} + +ARG PACKAGE_NAME="pytorch-max-series-bert-large-training" +ARG MODEL_WORKSPACE + +RUN pip install -r ${MODEL_WORKSPACE}/${PACKAGE_NAME}/models/language_modeling/pytorch/bert_large/training/gpu/requirements.txt + +RUN cd ${MODEL_WORKSPACE}/${PACKAGE_NAME}/models/language_modeling/pytorch/bert_large/training/gpu/data/ && \ + wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt && \ + mv bert-base-uncased-vocab.txt vocab. 
&& \ + cd - + +ENV USER_ID=0 + +ENV USER_NAME=root + +ENV GROUP_ID=0 + +ENV GROUP_NAME=root + +RUN apt-get update && \ + apt-get install --no-install-recommends --fix-missing -y gosu + +RUN echo '#!/bin/bash\n\ +[ -f /opt/intel/oneapi/setvars.sh ] && . /opt/intel/oneapi/setvars.sh\n\ +USER_ID=$USER_ID\n\ +USER_NAME=$USER_NAME\n\ +GROUP_ID=$GROUP_ID\n\ +GROUP_NAME=$GROUP_NAME\n\ +if [[ $GROUP_NAME != root ]]; then\n\ + groupadd -r -g $GROUP_ID $GROUP_NAME\n\ +fi\n\ +if [[ $USER_NAME != root ]]; then\n\ + useradd --no-log-init -r -u $USER_ID -g $GROUP_NAME -s /bin/bash -M $USER_NAME\n\ +fi\n\ +exec /usr/sbin/gosu $USER_NAME:$GROUP_NAME "$@"\n '\ +>> /tmp/entrypoint.sh + +RUN chmod u+x,g+x /tmp/entrypoint.sh + +ENTRYPOINT ["/tmp/entrypoint.sh"] diff --git a/dockerfiles/gpu_model_containers/pytorch-max-series-resnet50v1-5-inference.Dockerfile b/dockerfiles/gpu_model_containers/pytorch-max-series-resnet50v1-5-inference.Dockerfile new file mode 100644 index 000000000..7083fe234 --- /dev/null +++ b/dockerfiles/gpu_model_containers/pytorch-max-series-resnet50v1-5-inference.Dockerfile @@ -0,0 +1,71 @@ +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# +# THIS IS A GENERATED DOCKERFILE. +# +# This file was assembled from multiple pieces, whose use is documented +# throughout. Please refer to the TensorFlow dockerfiles documentation +# for more information. + +ARG PYTORCH_BASE_IMAGE="intel/intel-extension-for-pytorch" +ARG PYTORCH_BASE_TAG="xpu-max" + +FROM ${PYTORCH_BASE_IMAGE}:${PYTORCH_BASE_TAG} + +ARG PACKAGE_DIR=model_packages + +ARG PACKAGE_NAME="pytorch-max-series-resnet50v1-5-inference" + +ARG MODEL_WORKSPACE + +# ${MODEL_WORKSPACE} and below needs to be owned by root:root rather than the current UID:GID +# this allows the default user (root) to work in k8s single-node, multi-node +RUN umask 002 && mkdir -p ${MODEL_WORKSPACE} && chgrp root ${MODEL_WORKSPACE} && chmod g+s+w,o+s+r ${MODEL_WORKSPACE} + +ADD --chown=0:0 ${PACKAGE_DIR}/${PACKAGE_NAME}.tar.gz ${MODEL_WORKSPACE} + +RUN chown -R root ${MODEL_WORKSPACE}/${PACKAGE_NAME} && chgrp -R root ${MODEL_WORKSPACE}/${PACKAGE_NAME} && chmod -R g+s+w ${MODEL_WORKSPACE}/${PACKAGE_NAME} && find ${MODEL_WORKSPACE}/${PACKAGE_NAME} -type d | xargs chmod o+r+x + +WORKDIR ${MODEL_WORKSPACE}/${PACKAGE_NAME} + +ENV USER_ID=0 + +ENV USER_NAME=root + +ENV GROUP_ID=0 + +ENV GROUP_NAME=root + +RUN apt-get update && \ + apt-get install --no-install-recommends --fix-missing -y gosu + +RUN echo '#!/bin/bash\n\ +[ -f /opt/intel/oneapi/setvars.sh ] && . 
/opt/intel/oneapi/setvars.sh\n\ +USER_ID=$USER_ID\n\ +USER_NAME=$USER_NAME\n\ +GROUP_ID=$GROUP_ID\n\ +GROUP_NAME=$GROUP_NAME\n\ +if [[ $GROUP_NAME != root ]]; then\n\ + groupadd -r -g $GROUP_ID $GROUP_NAME\n\ +fi\n\ +if [[ $USER_NAME != root ]]; then\n\ + useradd --no-log-init -r -u $USER_ID -g $GROUP_NAME -s /bin/bash -M $USER_NAME\n\ +fi\n\ +exec /usr/sbin/gosu $USER_NAME:$GROUP_NAME "$@"\n '\ +>> /tmp/entrypoint.sh + +RUN chmod u+x,g+x /tmp/entrypoint.sh + +ENTRYPOINT ["/tmp/entrypoint.sh"] diff --git a/dockerfiles/gpu_model_containers/pytorch-max-series-resnet50v1-5-training.Dockerfile b/dockerfiles/gpu_model_containers/pytorch-max-series-resnet50v1-5-training.Dockerfile new file mode 100644 index 000000000..1f8277586 --- /dev/null +++ b/dockerfiles/gpu_model_containers/pytorch-max-series-resnet50v1-5-training.Dockerfile @@ -0,0 +1,82 @@ +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# +# THIS IS A GENERATED DOCKERFILE. +# +# This file was assembled from multiple pieces, whose use is documented +# throughout. Please refer to the TensorFlow dockerfiles documentation +# for more information. + +ARG PYTORCH_BASE_IMAGE="intel/intel-extension-for-pytorch" +ARG PYTORCH_BASE_TAG="xpu-max" + +FROM ${PYTORCH_BASE_IMAGE}:${PYTORCH_BASE_TAG} + +RUN curl -fsSL https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2023.PUB | apt-key add - +RUN echo "deb [trusted=yes] https://apt.repos.intel.com/oneapi all main " > /etc/apt/sources.list.d/oneAPI.list + +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + ca-certificates \ + intel-oneapi-mpi-devel \ + intel-oneapi-ccl \ + && \ + rm -rf /var/lib/apt/lists/* + +ARG PACKAGE_DIR=model_packages + +ARG PACKAGE_NAME="pytorch-max-series-resnet50v1-5-training" + +ARG MODEL_WORKSPACE + +# ${MODEL_WORKSPACE} and below needs to be owned by root:root rather than the current UID:GID +# this allows the default user (root) to work in k8s single-node, multi-node +RUN umask 002 && mkdir -p ${MODEL_WORKSPACE} && chgrp root ${MODEL_WORKSPACE} && chmod g+s+w,o+s+r ${MODEL_WORKSPACE} + +ADD --chown=0:0 ${PACKAGE_DIR}/${PACKAGE_NAME}.tar.gz ${MODEL_WORKSPACE} + +RUN chown -R root ${MODEL_WORKSPACE}/${PACKAGE_NAME} && chgrp -R root ${MODEL_WORKSPACE}/${PACKAGE_NAME} && chmod -R g+s+w ${MODEL_WORKSPACE}/${PACKAGE_NAME} && find ${MODEL_WORKSPACE}/${PACKAGE_NAME} -type d | xargs chmod o+r+x + +WORKDIR ${MODEL_WORKSPACE}/${PACKAGE_NAME} + +ENV USER_ID=0 + +ENV USER_NAME=root + +ENV GROUP_ID=0 + +ENV GROUP_NAME=root + +RUN apt-get update && \ + apt-get install --no-install-recommends --fix-missing -y gosu + +RUN echo '#!/bin/bash\n\ +[ -f /opt/intel/oneapi/setvars.sh ] && . 
/opt/intel/oneapi/setvars.sh\n\ +USER_ID=$USER_ID\n\ +USER_NAME=$USER_NAME\n\ +GROUP_ID=$GROUP_ID\n\ +GROUP_NAME=$GROUP_NAME\n\ +if [[ $GROUP_NAME != root ]]; then\n\ + groupadd -r -g $GROUP_ID $GROUP_NAME\n\ +fi\n\ +if [[ $USER_NAME != root ]]; then\n\ + useradd --no-log-init -r -u $USER_ID -g $GROUP_NAME -s /bin/bash -M $USER_NAME\n\ +fi\n\ +exec /usr/sbin/gosu $USER_NAME:$GROUP_NAME "$@"\n '\ +>> /tmp/entrypoint.sh + +RUN chmod u+x,g+x /tmp/entrypoint.sh + +ENTRYPOINT ["/tmp/entrypoint.sh"] diff --git a/dockerfiles/gpu_model_containers/tf-atsm-resnet50v1-5-inference.Dockerfile b/dockerfiles/gpu_model_containers/tf-atsm-resnet50v1-5-inference.Dockerfile new file mode 100644 index 000000000..69a87a487 --- /dev/null +++ b/dockerfiles/gpu_model_containers/tf-atsm-resnet50v1-5-inference.Dockerfile @@ -0,0 +1,71 @@ +# Copyright (c) 2020-2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# +# THIS IS A GENERATED DOCKERFILE. +# +# This file was assembled from multiple pieces, whose use is documented +# throughout. Please refer to the TensorFlow dockerfiles documentation +# for more information. + +ARG TENSORFLOW_BASE_IMAGE="intel/intel-extension-for-tensorflow" +ARG TENSORFLOW_BASE_TAG="gpu" + +FROM ${TENSORFLOW_BASE_IMAGE}:${TENSORFLOW_BASE_TAG} + +ARG PACKAGE_DIR=model_packages + +ARG PACKAGE_NAME="tf-atsm-resnet50v1-5-inference" + +ARG MODEL_WORKSPACE + +# ${MODEL_WORKSPACE} and below needs to be owned by root:root rather than the current UID:GID +# this allows the default user (root) to work in k8s single-node, multi-node +RUN umask 002 && mkdir -p ${MODEL_WORKSPACE} && chgrp root ${MODEL_WORKSPACE} && chmod g+s+w,o+s+r ${MODEL_WORKSPACE} + +ADD --chown=0:0 ${PACKAGE_DIR}/${PACKAGE_NAME}.tar.gz ${MODEL_WORKSPACE} + +RUN chown -R root ${MODEL_WORKSPACE}/${PACKAGE_NAME} && chgrp -R root ${MODEL_WORKSPACE}/${PACKAGE_NAME} && chmod -R g+s+w ${MODEL_WORKSPACE}/${PACKAGE_NAME} && find ${MODEL_WORKSPACE}/${PACKAGE_NAME} -type d | xargs chmod o+r+x + +WORKDIR ${MODEL_WORKSPACE}/${PACKAGE_NAME} + +ENV USER_ID=0 + +ENV USER_NAME=root + +ENV GROUP_ID=0 + +ENV GROUP_NAME=root + +RUN apt-get update && \ + apt-get install --no-install-recommends --fix-missing -y gosu + +RUN echo '#!/bin/bash\n\ +[ -f /opt/intel/oneapi/setvars.sh ] && . 
/opt/intel/oneapi/setvars.sh --config=$HOME/cfg.txt\n\ +USER_ID=$USER_ID\n\ +USER_NAME=$USER_NAME\n\ +GROUP_ID=$GROUP_ID\n\ +GROUP_NAME=$GROUP_NAME\n\ +if [[ $GROUP_NAME != root ]]; then\n\ + groupadd -r -g $GROUP_ID $GROUP_NAME\n\ +fi\n\ +if [[ $USER_NAME != root ]]; then\n\ + useradd --no-log-init -r -u $USER_ID -g $GROUP_NAME -s /bin/bash -M $USER_NAME\n\ +fi\n\ +exec /usr/sbin/gosu $USER_NAME:$GROUP_NAME "$@"\n '\ +>> /tmp/entrypoint.sh + +RUN chmod u+x,g+x /tmp/entrypoint.sh + +ENTRYPOINT ["/tmp/entrypoint.sh"] diff --git a/dockerfiles/gpu_model_containers/tf-atsm-ssd-mobilenet-inference.Dockerfile b/dockerfiles/gpu_model_containers/tf-atsm-ssd-mobilenet-inference.Dockerfile new file mode 100644 index 000000000..f4816de2e --- /dev/null +++ b/dockerfiles/gpu_model_containers/tf-atsm-ssd-mobilenet-inference.Dockerfile @@ -0,0 +1,92 @@ +# Copyright (c) 2020-2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# +# THIS IS A GENERATED DOCKERFILE. +# +# This file was assembled from multiple pieces, whose use is documented +# throughout. Please refer to the TensorFlow dockerfiles documentation +# for more information. + +ARG TENSORFLOW_BASE_IMAGE="intel/intel-extension-for-tensorflow" +ARG TENSORFLOW_BASE_TAG="gpu" + +FROM ${TENSORFLOW_BASE_IMAGE}:${TENSORFLOW_BASE_TAG} + +RUN apt-get update && \ + apt-get install -y --no-install-recommends --fix-missing numactl + +ARG PY_VERSION=3.9 + +RUN apt-get update && \ + apt-get install -y --no-install-recommends --fix-missing \ + build-essential \ + python${PY_VERSION}-dev + +# Note pycocotools has to be install after the other requirements +RUN pip install \ + Cython \ + contextlib2 \ + jupyter \ + lxml \ + matplotlib \ + numpy>=1.17.4 \ + 'pillow>=9.3.0' && \ + pip install pycocotools + +ARG PACKAGE_DIR=model_packages + +ARG PACKAGE_NAME="tf-atsm-ssd-mobilenet-inference" + +ARG MODEL_WORKSPACE + +# ${MODEL_WORKSPACE} and below needs to be owned by root:root rather than the current UID:GID +# this allows the default user (root) to work in k8s single-node, multi-node +RUN umask 002 && mkdir -p ${MODEL_WORKSPACE} && chgrp root ${MODEL_WORKSPACE} && chmod g+s+w,o+s+r ${MODEL_WORKSPACE} + +ADD --chown=0:0 ${PACKAGE_DIR}/${PACKAGE_NAME}.tar.gz ${MODEL_WORKSPACE} + +RUN chown -R root ${MODEL_WORKSPACE}/${PACKAGE_NAME} && chgrp -R root ${MODEL_WORKSPACE}/${PACKAGE_NAME} && chmod -R g+s+w ${MODEL_WORKSPACE}/${PACKAGE_NAME} && find ${MODEL_WORKSPACE}/${PACKAGE_NAME} -type d | xargs chmod o+r+x + +WORKDIR ${MODEL_WORKSPACE}/${PACKAGE_NAME} + +ENV USER_ID=0 + +ENV USER_NAME=root + +ENV GROUP_ID=0 + +ENV GROUP_NAME=root + +RUN apt-get update && \ + apt-get install --no-install-recommends --fix-missing -y gosu + +RUN echo '#!/bin/bash\n\ +[ -f /opt/intel/oneapi/setvars.sh ] && . 
/opt/intel/oneapi/setvars.sh --config=$HOME/cfg.txt\n\ +USER_ID=$USER_ID\n\ +USER_NAME=$USER_NAME\n\ +GROUP_ID=$GROUP_ID\n\ +GROUP_NAME=$GROUP_NAME\n\ +if [[ $GROUP_NAME != root ]]; then\n\ + groupadd -r -g $GROUP_ID $GROUP_NAME\n\ +fi\n\ +if [[ $USER_NAME != root ]]; then\n\ + useradd --no-log-init -r -u $USER_ID -g $GROUP_NAME -s /bin/bash -M $USER_NAME\n\ +fi\n\ +exec /usr/sbin/gosu $USER_NAME:$GROUP_NAME "$@"\n '\ +>> /tmp/entrypoint.sh + +RUN chmod u+x,g+x /tmp/entrypoint.sh + +ENTRYPOINT ["/tmp/entrypoint.sh"] diff --git a/dockerfiles/gpu_model_containers/tf-max-series-bert-large-inference.Dockerfile b/dockerfiles/gpu_model_containers/tf-max-series-bert-large-inference.Dockerfile new file mode 100644 index 000000000..d008ac648 --- /dev/null +++ b/dockerfiles/gpu_model_containers/tf-max-series-bert-large-inference.Dockerfile @@ -0,0 +1,70 @@ +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# +# THIS IS A GENERATED DOCKERFILE. +# +# This file was assembled from multiple pieces, whose use is documented +# throughout. Please refer to the TensorFlow dockerfiles documentation +# for more information. + +ARG TENSORFLOW_BASE_IMAGE="intel/intel-extension-for-tensorflow" +ARG TENSORFLOW_BASE_TAG="gpu-max" + +FROM ${TENSORFLOW_BASE_IMAGE}:${TENSORFLOW_BASE_TAG} +ARG PACKAGE_DIR=model_packages + +ARG PACKAGE_NAME="tf-max-series-bert-large-inference" + +ARG MODEL_WORKSPACE + +# ${MODEL_WORKSPACE} and below needs to be owned by root:root rather than the current UID:GID +# this allows the default user (root) to work in k8s single-node, multi-node +RUN umask 002 && mkdir -p ${MODEL_WORKSPACE} && chgrp root ${MODEL_WORKSPACE} && chmod g+s+w,o+s+r ${MODEL_WORKSPACE} + +ADD --chown=0:0 ${PACKAGE_DIR}/${PACKAGE_NAME}.tar.gz ${MODEL_WORKSPACE} + +RUN chown -R root ${MODEL_WORKSPACE}/${PACKAGE_NAME} && chgrp -R root ${MODEL_WORKSPACE}/${PACKAGE_NAME} && chmod -R g+s+w ${MODEL_WORKSPACE}/${PACKAGE_NAME} && find ${MODEL_WORKSPACE}/${PACKAGE_NAME} -type d | xargs chmod o+r+x + +WORKDIR ${MODEL_WORKSPACE}/${PACKAGE_NAME} + +ENV USER_ID=0 + +ENV USER_NAME=root + +ENV GROUP_ID=0 + +ENV GROUP_NAME=root + +RUN apt-get update && \ + apt-get install --no-install-recommends --fix-missing -y gosu + +RUN echo '#!/bin/bash\n\ +[ -f /opt/intel/oneapi/setvars.sh ] && . 
/opt/intel/oneapi/setvars.sh\n\ +USER_ID=$USER_ID\n\ +USER_NAME=$USER_NAME\n\ +GROUP_ID=$GROUP_ID\n\ +GROUP_NAME=$GROUP_NAME\n\ +if [[ $GROUP_NAME != root ]]; then\n\ + groupadd -r -g $GROUP_ID $GROUP_NAME\n\ +fi\n\ +if [[ $USER_NAME != root ]]; then\n\ + useradd --no-log-init -r -u $USER_ID -g $GROUP_NAME -s /bin/bash -M $USER_NAME\n\ +fi\n\ +exec /usr/sbin/gosu $USER_NAME:$GROUP_NAME "$@"\n '\ +>> /tmp/entrypoint.sh + +RUN chmod u+x,g+x /tmp/entrypoint.sh + +ENTRYPOINT ["/tmp/entrypoint.sh"] diff --git a/dockerfiles/gpu_model_containers/tf-max-series-bert-large-training.Dockerfile b/dockerfiles/gpu_model_containers/tf-max-series-bert-large-training.Dockerfile new file mode 100644 index 000000000..e63861261 --- /dev/null +++ b/dockerfiles/gpu_model_containers/tf-max-series-bert-large-training.Dockerfile @@ -0,0 +1,85 @@ +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# +# THIS IS A GENERATED DOCKERFILE. +# +# This file was assembled from multiple pieces, whose use is documented +# throughout. Please refer to the TensorFlow dockerfiles documentation +# for more information. + +ARG TENSORFLOW_BASE_IMAGE="intel/intel-extension-for-tensorflow" +ARG TENSORFLOW_BASE_TAG="gpu-max" + +FROM ${TENSORFLOW_BASE_IMAGE}:${TENSORFLOW_BASE_TAG} +RUN curl -fsSL https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2023.PUB | apt-key add - +RUN echo "deb [trusted=yes] https://apt.repos.intel.com/oneapi all main " > /etc/apt/sources.list.d/oneAPI.list + +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + ca-certificates \ + intel-oneapi-mpi-devel \ + intel-oneapi-ccl \ + && \ + rm -rf /var/lib/apt/lists/* + +ARG PACKAGE_DIR=model_packages + +ARG PACKAGE_NAME="tf-max-series-bert-large-training" + +ARG MODEL_WORKSPACE + +# ${MODEL_WORKSPACE} and below needs to be owned by root:root rather than the current UID:GID +# this allows the default user (root) to work in k8s single-node, multi-node +RUN umask 002 && mkdir -p ${MODEL_WORKSPACE} && chgrp root ${MODEL_WORKSPACE} && chmod g+s+w,o+s+r ${MODEL_WORKSPACE} + +ADD --chown=0:0 ${PACKAGE_DIR}/${PACKAGE_NAME}.tar.gz ${MODEL_WORKSPACE} + +RUN chown -R root ${MODEL_WORKSPACE}/${PACKAGE_NAME} && chgrp -R root ${MODEL_WORKSPACE}/${PACKAGE_NAME} && chmod -R g+s+w ${MODEL_WORKSPACE}/${PACKAGE_NAME} && find ${MODEL_WORKSPACE}/${PACKAGE_NAME} -type d | xargs chmod o+r+x + +WORKDIR ${MODEL_WORKSPACE}/${PACKAGE_NAME} + +ARG PACKAGE_NAME="tf-max-series-bert-large-training" +ARG MODEL_WORKSPACE + +RUN git apply ${MODEL_WORKSPACE}/${PACKAGE_NAME}/quickstart/hvs_support.patch +ENV USER_ID=0 + +ENV USER_NAME=root + +ENV GROUP_ID=0 + +ENV GROUP_NAME=root + +RUN apt-get update && \ + apt-get install --no-install-recommends --fix-missing -y gosu + +RUN echo '#!/bin/bash\n\ +[ -f /opt/intel/oneapi/setvars.sh ] && . 
/opt/intel/oneapi/setvars.sh\n\ +USER_ID=$USER_ID\n\ +USER_NAME=$USER_NAME\n\ +GROUP_ID=$GROUP_ID\n\ +GROUP_NAME=$GROUP_NAME\n\ +if [[ $GROUP_NAME != root ]]; then\n\ + groupadd -r -g $GROUP_ID $GROUP_NAME\n\ +fi\n\ +if [[ $USER_NAME != root ]]; then\n\ + useradd --no-log-init -r -u $USER_ID -g $GROUP_NAME -s /bin/bash -M $USER_NAME\n\ +fi\n\ +exec /usr/sbin/gosu $USER_NAME:$GROUP_NAME "$@"\n '\ +>> /tmp/entrypoint.sh + +RUN chmod u+x,g+x /tmp/entrypoint.sh + +ENTRYPOINT ["/tmp/entrypoint.sh"] diff --git a/dockerfiles/gpu_model_containers/tf-max-series-resnet50v1-5-inference.Dockerfile b/dockerfiles/gpu_model_containers/tf-max-series-resnet50v1-5-inference.Dockerfile new file mode 100644 index 000000000..db0bbb34b --- /dev/null +++ b/dockerfiles/gpu_model_containers/tf-max-series-resnet50v1-5-inference.Dockerfile @@ -0,0 +1,71 @@ +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# +# THIS IS A GENERATED DOCKERFILE. +# +# This file was assembled from multiple pieces, whose use is documented +# throughout. Please refer to the TensorFlow dockerfiles documentation +# for more information. + +ARG TENSORFLOW_BASE_IMAGE="intel/intel-extension-for-tensorflow" +ARG TENSORFLOW_BASE_TAG="gpu-max" + +FROM ${TENSORFLOW_BASE_IMAGE}:${TENSORFLOW_BASE_TAG} + +ARG PACKAGE_DIR=model_packages + +ARG PACKAGE_NAME="tf-max-series-resnet50v1-5-inference" + +ARG MODEL_WORKSPACE + +# ${MODEL_WORKSPACE} and below needs to be owned by root:root rather than the current UID:GID +# this allows the default user (root) to work in k8s single-node, multi-node +RUN umask 002 && mkdir -p ${MODEL_WORKSPACE} && chgrp root ${MODEL_WORKSPACE} && chmod g+s+w,o+s+r ${MODEL_WORKSPACE} + +ADD --chown=0:0 ${PACKAGE_DIR}/${PACKAGE_NAME}.tar.gz ${MODEL_WORKSPACE} + +RUN chown -R root ${MODEL_WORKSPACE}/${PACKAGE_NAME} && chgrp -R root ${MODEL_WORKSPACE}/${PACKAGE_NAME} && chmod -R g+s+w ${MODEL_WORKSPACE}/${PACKAGE_NAME} && find ${MODEL_WORKSPACE}/${PACKAGE_NAME} -type d | xargs chmod o+r+x + +WORKDIR ${MODEL_WORKSPACE}/${PACKAGE_NAME} + +ENV USER_ID=0 + +ENV USER_NAME=root + +ENV GROUP_ID=0 + +ENV GROUP_NAME=root + +RUN apt-get update && \ + apt-get install --no-install-recommends --fix-missing -y gosu + +RUN echo '#!/bin/bash\n\ +[ -f /opt/intel/oneapi/setvars.sh ] && . 
/opt/intel/oneapi/setvars.sh\n\ +USER_ID=$USER_ID\n\ +USER_NAME=$USER_NAME\n\ +GROUP_ID=$GROUP_ID\n\ +GROUP_NAME=$GROUP_NAME\n\ +if [[ $GROUP_NAME != root ]]; then\n\ + groupadd -r -g $GROUP_ID $GROUP_NAME\n\ +fi\n\ +if [[ $USER_NAME != root ]]; then\n\ + useradd --no-log-init -r -u $USER_ID -g $GROUP_NAME -s /bin/bash -M $USER_NAME\n\ +fi\n\ +exec /usr/sbin/gosu $USER_NAME:$GROUP_NAME "$@"\n '\ +>> /tmp/entrypoint.sh + +RUN chmod u+x,g+x /tmp/entrypoint.sh + +ENTRYPOINT ["/tmp/entrypoint.sh"] diff --git a/dockerfiles/gpu_model_containers/tf-max-series-resnet50v1-5-training.Dockerfile b/dockerfiles/gpu_model_containers/tf-max-series-resnet50v1-5-training.Dockerfile new file mode 100644 index 000000000..8f0e7ab55 --- /dev/null +++ b/dockerfiles/gpu_model_containers/tf-max-series-resnet50v1-5-training.Dockerfile @@ -0,0 +1,82 @@ +# Copyright (c) 2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +# +# THIS IS A GENERATED DOCKERFILE. +# +# This file was assembled from multiple pieces, whose use is documented +# throughout. Please refer to the TensorFlow dockerfiles documentation +# for more information. + +ARG TENSORFLOW_BASE_IMAGE="intel/intel-extension-for-tensorflow" +ARG TENSORFLOW_BASE_TAG="gpu-max" + +FROM ${TENSORFLOW_BASE_IMAGE}:${TENSORFLOW_BASE_TAG} + +RUN curl -fsSL https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS-2023.PUB | apt-key add - +RUN echo "deb [trusted=yes] https://apt.repos.intel.com/oneapi all main " > /etc/apt/sources.list.d/oneAPI.list + +RUN apt-get update && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ + ca-certificates \ + intel-oneapi-mpi-devel \ + intel-oneapi-ccl \ + && \ + rm -rf /var/lib/apt/lists/* + +ARG PACKAGE_DIR=model_packages + +ARG PACKAGE_NAME="tf-max-series-resnet50v1-5-training" + +ARG MODEL_WORKSPACE + +# ${MODEL_WORKSPACE} and below needs to be owned by root:root rather than the current UID:GID +# this allows the default user (root) to work in k8s single-node, multi-node +RUN umask 002 && mkdir -p ${MODEL_WORKSPACE} && chgrp root ${MODEL_WORKSPACE} && chmod g+s+w,o+s+r ${MODEL_WORKSPACE} + +ADD --chown=0:0 ${PACKAGE_DIR}/${PACKAGE_NAME}.tar.gz ${MODEL_WORKSPACE} + +RUN chown -R root ${MODEL_WORKSPACE}/${PACKAGE_NAME} && chgrp -R root ${MODEL_WORKSPACE}/${PACKAGE_NAME} && chmod -R g+s+w ${MODEL_WORKSPACE}/${PACKAGE_NAME} && find ${MODEL_WORKSPACE}/${PACKAGE_NAME} -type d | xargs chmod o+r+x + +WORKDIR ${MODEL_WORKSPACE}/${PACKAGE_NAME} + +ENV USER_ID=0 + +ENV USER_NAME=root + +ENV GROUP_ID=0 + +ENV GROUP_NAME=root + +RUN apt-get update && \ + apt-get install --no-install-recommends --fix-missing -y gosu + +RUN echo '#!/bin/bash\n\ +[ -f /opt/intel/oneapi/setvars.sh ] && . 
/opt/intel/oneapi/setvars.sh\n\ +USER_ID=$USER_ID\n\ +USER_NAME=$USER_NAME\n\ +GROUP_ID=$GROUP_ID\n\ +GROUP_NAME=$GROUP_NAME\n\ +if [[ $GROUP_NAME != root ]]; then\n\ + groupadd -r -g $GROUP_ID $GROUP_NAME\n\ +fi\n\ +if [[ $USER_NAME != root ]]; then\n\ + useradd --no-log-init -r -u $USER_ID -g $GROUP_NAME -s /bin/bash -M $USER_NAME\n\ +fi\n\ +exec /usr/sbin/gosu $USER_NAME:$GROUP_NAME "$@"\n '\ +>> /tmp/entrypoint.sh + +RUN chmod u+x,g+x /tmp/entrypoint.sh + +ENTRYPOINT ["/tmp/entrypoint.sh"] diff --git a/docs/general/FLEX_DEVCATALOG.md b/docs/general/FLEX_DEVCATALOG.md new file mode 100644 index 000000000..9a0dd335c --- /dev/null +++ b/docs/general/FLEX_DEVCATALOG.md @@ -0,0 +1,24 @@ +# Model Zoo for Intel® Architecture Workloads Optimized for the Intel® Data Center GPU Flex Series + +This document provides links to step-by-step instructions on how to leverage Model Zoo docker containers to run optimized open-source Deep Learning inference workloads using Intel® Extension for PyTorch* and Intel® Extension for TensorFlow* on the [Intel® Data Center GPU Flex Series](https://www.intel.com/content/www/us/en/products/docs/discrete-gpus/data-center-gpu/flex-series/overview.html). + +## Base Containers + +| AI Framework | Extension | Documentation | +| -----------------------------| ------------- | ----------------- | +| PyTorch | Intel® Extension for PyTorch* | [Intel® Extension for PyTorch Container](https://github.com/IntelAI/models/blob/master/quickstart/ipex-tool-container/gpu/devcatalog.md) | +| TensorFlow | Intel® Extension for TensorFlow* | [Intel® Extension for TensorFlow Container](https://github.com/IntelAI/models/blob/master/quickstart/tf-tool-container/gpu/devcatalog.md)| + +## Optimized Workloads + +The table below provides links to run each workload in a docker container. The containers are optimized for Linux*. 
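+
+As a general orientation before the per-model links, the sketch below shows the typical shape of a container launch; the image name, tag, paths, and quickstart script are placeholder assumptions rather than values taken from this repository, and the documentation linked in the table gives the exact, supported commands for each workload:
+
+```bash
+# Placeholder values -- take the real image, tag, environment variables, and
+# quickstart script from the per-model documentation linked in the table.
+export DATASET_DIR=/path/to/dataset
+export OUTPUT_DIR=/path/to/output
+
+docker run --rm -it \
+  --device=/dev/dri \
+  --ipc=host \
+  --env DATASET_DIR=${DATASET_DIR} \
+  --env OUTPUT_DIR=${OUTPUT_DIR} \
+  --volume ${DATASET_DIR}:${DATASET_DIR} \
+  --volume ${OUTPUT_DIR}:${OUTPUT_DIR} \
+  <model-container-image>:<tag> \
+  /bin/bash quickstart/<quickstart-script>.sh
+```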
+ +
| Model | Framework | Mode | Documentation | Dataset |
| ---------------------------- | ---------- | ---------- | ------------------- | ------------ |
| [ResNet 50 v1.5](https://github.com/tensorflow/models/tree/v2.11.0/official/legacy/image_classification/resnet) | TensorFlow | Inference | [INT8](https://github.com/IntelAI/models/blob/master/quickstart/image_recognition/tensorflow/resnet50v1_5/inference/gpu/devcatalog.md) | [ImageNet 2012](https://github.com/IntelAI/models/tree/master/datasets/imagenet/README.md) |
| [ResNet 50 v1.5](https://arxiv.org/pdf/1512.03385.pdf) | PyTorch | Inference | [INT8](https://github.com/IntelAI/models/blob/master/quickstart/image_recognition/pytorch/resnet50v1_5/inference/gpu/devcatalog.md) | [ImageNet 2012](https://github.com/IntelAI/models/tree/master/datasets/imagenet/README.md) |
| [SSD-MobileNet v1](https://arxiv.org/pdf/1704.04861.pdf) | PyTorch | Inference | [INT8](https://github.com/IntelAI/models/blob/master/quickstart/object_detection/pytorch/ssd-mobilenet/inference/gpu/devcatalog.md) | [COCO 2017](https://github.com/IntelAI/models/blob/master/quickstart/object_detection/pytorch/ssd-mobilenet/inference/gpu/README.md#datasets) |
| [YOLO v4](https://arxiv.org/pdf/2004.10934.pdf) | PyTorch | Inference | [INT8](https://github.com/IntelAI/models/blob/master/quickstart/object_detection/pytorch/yolov4/inference/gpu/devcatalog.md) | [COCO 2017](https://github.com/IntelAI/models/blob/master/quickstart/object_detection/pytorch/ssd-mobilenet/inference/gpu/README.md#datasets) |
| [SSD-MobileNet](https://arxiv.org/pdf/1704.04861.pdf) | TensorFlow | Inference | [INT8](https://github.com/IntelAI/models/blob/master/quickstart/object_detection/tensorflow/ssd-mobilenet/inference/gpu/devcatalog.md) | [COCO 2017 validation dataset](https://github.com/IntelAI/models/tree/master/datasets/coco#download-and-preprocess-the-coco-validation-images) |
diff --git a/models/image_recognition/pytorch/resnet50v1_5/inference/gpu/__init__.py b/models/image_recognition/pytorch/resnet50v1_5/inference/gpu/__init__.py new file mode 100644 index 000000000..5924359a9 --- /dev/null +++ b/models/image_recognition/pytorch/resnet50v1_5/inference/gpu/__init__.py @@ -0,0 +1,14 @@ +# Copyright (c) 2022 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# diff --git a/models/image_recognition/pytorch/resnet50v1_5/inference/gpu/main.py b/models/image_recognition/pytorch/resnet50v1_5/inference/gpu/main.py new file mode 100644 index 000000000..af229fd26 --- /dev/null +++ b/models/image_recognition/pytorch/resnet50v1_5/inference/gpu/main.py @@ -0,0 +1,1059 @@ +# +# **************************************************************************** +# Copyright 2019-2022 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# **************************************************************************** + +# **************************************************************************** +# BSD 3-Clause License +# # This model is originally from the PyTorch Examples repo +# (https://github.com/pytorch/examples/blob/master/imagenet/main.py) +# **************************************************************************** + +# **************************************************************************** +# Copyright (c) 2017, +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# **************************************************************************** + +import argparse +import os +import sys +import random +import shutil +import time +import warnings +from enum import Enum + +import torch +import torch.nn as nn +import torch.nn.parallel +import torch.backends.cudnn as cudnn +import torch.distributed as dist +import torch.optim +from torch.optim.lr_scheduler import StepLR +import torch.multiprocessing as mp +import torch.utils.data +import torch.utils.data.distributed +import torchvision.transforms as transforms +import torchvision.datasets as datasets +import torchvision.models as models +from torch.utils.data import Subset +import math + +torch._C._jit_set_profiling_mode(False) +torch._C._jit_set_profiling_executor(False) + +model_names = sorted(name for name in models.__dict__ + if name.islower() and not name.startswith("__") + and callable(models.__dict__[name])) + +cwd = os.path.dirname(os.path.abspath(__file__)) +hub = os.path.expanduser("~/.cache/torch/intel") +if not os.path.exists(hub): + os.makedirs(hub) + +parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') +parser.add_argument('data', metavar='DIR', nargs='?', default='imagenet', + help='path to dataset (default: imagenet)') +parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet18', + choices=model_names, + help='model architecture: ' + + ' | '.join(model_names) + + ' (default: resnet18)') +parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', + help='number of data loading workers (default: 4)') +parser.add_argument('--epochs', default=90, type=int, metavar='N', + help='number of total epochs to run') +parser.add_argument('--start-epoch', default=0, type=int, metavar='N', + help='manual epoch number (useful on restarts)') +parser.add_argument('-b', '--batch-size', default=256, type=int, + metavar='N', + help='mini-batch size (default: 256), this is the total ' + 'batch size of all GPUs on the current node when ' + 'using Data Parallel or Distributed Data Parallel') +parser.add_argument('--lr', '--learning-rate', default=0.1, type=float, + metavar='LR', help='initial learning rate', dest='lr') +parser.add_argument('--momentum', default=0.9, type=float, metavar='M', + help='momentum') +parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float, + metavar='W', help='weight decay (default: 1e-4)', + dest='weight_decay') +parser.add_argument('-p', '--print-freq', default=1, type=int, + metavar='N', help='print frequency (default: 10)') +parser.add_argument('--resume', default='', type=str, metavar='PATH', + help='path to latest checkpoint (default: none)') +parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true', + help='evaluate model on validation set') +parser.add_argument('--pretrained', dest='pretrained', action='store_true', + help='use pre-trained model') +parser.add_argument('--world-size', default=-1, type=int, + help='number of nodes for distributed training') +parser.add_argument('--rank', default=-1, type=int, + help='node rank for distributed training') +parser.add_argument('--dist-url', default='127.0.0.1', type=str, + help='url used to set up distributed training') +parser.add_argument('--dist-port', default='29500', type=str, + help='url port used to set up distributed training') +parser.add_argument('--dist-backend', default='ccl', type=str, + help='distributed backend, default is torch-ccl') +parser.add_argument('--seed', default=None, type=int, + help='seed for initializing training. 
') +parser.add_argument('--gpu', default=None, type=int, + help='GPU id to use.') +parser.add_argument('--xpu', default=None, type=int, + help='XPU id to use.') +parser.add_argument('--tf32', default=0, type=int, help='Datatype used: TF32') +parser.add_argument('--bf32', default=0, type=int, help='Datatype used: BF32') +parser.add_argument('--fp16', default=0, type=int, help='Datatype used: FP16') +parser.add_argument('--bf16', default=0, type=int, help='Datatype used: BF16') +parser.add_argument('--int8', default=0, type=int, help='Use int8 quantization to do inference') +parser.add_argument('--broadcast-buffers', default=True, type=bool, help='enables syncing buffers') +parser.add_argument('--bucket-cap', default=25, type=int, help='controls the bucket size in MegaBytes') +parser.add_argument('--jit-cache', type=str, default=str(hub), help="path to save/load jit model") +parser.add_argument('--jit-trace', action='store_true', + help='enable PyTorch jit trace graph mode') +parser.add_argument('--calib-iters', default=8, type=int, + help='iteration number for calibration') +parser.add_argument('--calib-bs', default=32, type=int, + metavar='N', help='mini-batch size for calibration') +parser.add_argument('--perchannel-weight', default=False, + help='do calibration with weight per channel quantization') +parser.add_argument('--channels-last', action='store_true', help='enable channels last') +parser.add_argument('--num-iterations', default=0, type=int) +parser.add_argument('--tensorboard', default=None, action='store_true', + help='Use Tensorboard to visualize the training metrics') +parser.add_argument("--dummy", action="store_true", help='use dummy data for ' + 'benchmark training or val') +parser.add_argument('--multiprocessing-distributed', action='store_true', + help='Use multi-processing distributed training to launch ' + 'N processes per node, which has N GPUs. 
This is the ' + 'fastest way to use PyTorch for either single node or ' + 'multi node data parallel training') +parser.add_argument('--benchmark', default=0, type=int, help='for int8 benchmark ' + 'performance, move H2D out of E2E time') +parser.add_argument("--save", help='Path to save entile model, save infernce mode, training is not available') +parser.add_argument("--load", help='Path to load entile inference model') + +best_acc1 = 0 + +def main(): + args = parser.parse_args() + + if args.xpu is not None and args.gpu is not None: + print('You need to choose running on NV GPU or XPU.') + sys.exit() + + if args.gpu is not None and not torch.cuda.is_available(): + print('Make sure cuda is enabled in torch.') + sys.exit() + + if args.xpu is not None: + import intel_extension_for_pytorch as ipex + + # only for training + if not args.evaluate: + if args.tf32: + print('doing TF32 training') + torch.xpu.set_fp32_math_mode(torch.xpu.FP32MathMode.TF32) + elif args.bf32: + args.bf16 = 1 + print('doing BF32 training') + torch.xpu.set_fp32_math_mode(torch.xpu.FP32MathMode.BF32) + else: + torch.xpu.set_fp32_math_mode(torch.xpu.FP32MathMode.FP32) + + if args.dist_backend == 'ccl': + try: + import oneccl_bindings_for_pytorch + except ImportError: + print("oneccl_bindings_for_pytorch not available!") + + if args.int8 and (not args.evaluate or args.xpu is None): + print('For int8 quantization, it is only used in XPU inference, ' + 'you need to pass -e and --xpu [dev_id] in your command') + sys.exit() + + if args.int8 and args.channels_last: + print('For int8 quantization, channels last is not supported for now') + sys.exit() + + if args.tensorboard is not None: + from torch.utils.tensorboard import SummaryWriter + global writer + writer = SummaryWriter(log_dir='./tensorboard_log') + if args.num_iterations is not None: + warnings.warn('Tensorboard is displaying at epoch unit.') + + if args.seed is not None: + random.seed(args.seed) + torch.manual_seed(args.seed) + cudnn.deterministic = True + warnings.warn('You have chosen to seed training. ' + 'This will turn on the CUDNN deterministic setting, ' + 'which can slow down your training considerably! ' + 'You may see unexpected behavior when restarting ' + 'from checkpoints.') + + if args.gpu is not None: + warnings.warn('You have chosen a specific GPU. 
This will completely ' + 'disable data parallelism.') + + if args.world_size == -1: + mpi_world_size = int(os.environ.get('PMI_SIZE', -1)) + + if mpi_world_size > 0: + os.environ['MASTER_ADDR'] = args.dist_url #'127.0.0.1' + os.environ['MASTER_PORT'] = args.dist_port #'29500' + os.environ['RANK'] = os.environ.get('PMI_RANK', -1) + os.environ['WORLD_SIZE'] = os.environ.get('PMI_SIZE', -1) + args.rank = int(os.environ.get('PMI_RANK', -1)) + args.world_size = int(os.environ.get("WORLD_SIZE", -1)) + + args.distributed = args.world_size > 1 or args.multiprocessing_distributed + + # 1 XPU card has 2 tile, and both are regarded as isolated devices/nodes + ngpus_per_node = 1 + if args.multiprocessing_distributed: + # Since we have ngpus_per_node processes per node, the total world_size + # needs to be adjusted accordingly + args.world_size = ngpus_per_node * args.world_size + # Use torch.multiprocessing.spawn to launch distributed processes: the + # main_worker process function + mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args)) + else: + # Simply call main_worker function + main_worker(ngpus_per_node, args) + +def jit_calib(model, val_loader_calib, args): + print('doing int8 jit calibration') + jit_model_file = os.path.join(args.jit_cache, "rn50_jit_model_int8.pt") + if os.path.isfile(jit_model_file): + print("=> load jit model from {}".format(jit_model_file)) + modelJit = torch.load(jit_model_file) + print("=> load jit model ... done") + else: + from torch.jit._recursive import wrap_cpp_module + from torch.quantization.quantize_jit import ( + convert_jit, + prepare_jit, + ) + modelJit = torch.jit.script(model) + modelJit = wrap_cpp_module(torch._C._jit_pass_fold_convbn(modelJit._c)) + + with torch.inference_mode(): + if args.perchannel_weight: + qconfig = torch.quantization.QConfig( + activation=torch.quantization.observer.MinMaxObserver.with_args( + qscheme=torch.per_tensor_symmetric, + reduce_range=False, + dtype=torch.quint8 + ), + weight=torch.quantization.default_per_channel_weight_observer + ) + else: + qconfig = torch.quantization.QConfig( + activation=torch.quantization.observer.MinMaxObserver.with_args( + qscheme=torch.per_tensor_symmetric, + reduce_range=False, + dtype=torch.quint8 + ), + weight=torch.quantization.default_weight_observer + ) + modelJit = prepare_jit(modelJit, {'': qconfig}, True) + + for i, (input, target) in enumerate(val_loader_calib): + calib = input.to(args.xpu) + modelJit(calib) + + if i == args.calib_iters - 1: + break + modelJit = convert_jit(modelJit, True) + + return modelJit + +def main_worker(ngpus_per_node, args): + global best_acc1 + + if args.distributed: + if args.rank == -1: + args.rank = int(os.environ["RANK"]) + if args.multiprocessing_distributed: + # For multiprocessing distributed training, rank needs to be the + # global rank among all the processes + args.rank = args.rank * ngpus_per_node + args.gpu + init_method = 'tcp://' + args.dist_url + ':' + args.dist_port + dist.init_process_group(backend=args.dist_backend, init_method=init_method, + world_size=args.world_size, rank=args.rank) + + if args.gpu is not None: + args.gpu = args.rank + elif args.xpu is not None: + local_rank = os.environ['MPI_LOCALRANKID'] + if 'OMPI_COMM_WORLD_LOCAL_RANK' in os.environ.keys(): + local_rank = os.environ['OMPI_COMM_WORLD_LOCAL_RANK'] + args.xpu = local_rank + print('world_size:{}, rank:{}, local_rank:{}'.format(args.world_size, args.rank, local_rank)) + + if args.gpu is not None: + print("Use GPU: {}".format(args.gpu)) + args.gpu = 
"cuda:{}".format(args.gpu) + elif args.xpu is not None: + print("Use XPU: {}".format(args.xpu)) + args.xpu = "xpu:{}".format(args.xpu) + else: + print("Use CPU") + + # define loss function (criterion) + criterion = nn.CrossEntropyLoss() + if args.gpu is not None: + criterion = nn.CrossEntropyLoss().cuda(args.gpu) + elif args.xpu is not None: + criterion = nn.CrossEntropyLoss().xpu(args.xpu) + # create model + if args.load: + if os.path.isfile(args.load): + load_path = args.load + if args.jit_trace: + model = torch.jit.load(load_path) + elif args.evaluate and args.int8: + model = torch.jit.load(load_path) + else: + model = torch.load(load_path) + optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, + momentum=args.momentum, weight_decay=args.weight_decay) + scheduler = StepLR(optimizer, step_size=30, gamma=0.1) + else: + print("=> no saved model found at '{}'".format(args.load)) + sys.exit(1) + else: + if args.pretrained: + print("=> using pre-trained model '{}'".format(args.arch)) + model = models.__dict__[args.arch](pretrained=True) + else: + print("=> creating model '{}'".format(args.arch)) + model = models.__dict__[args.arch]() + + # channels last + # TODO: this will be default memory format in future + if args.channels_last: + print('model is converted to channels last') + model = model.to(memory_format=torch.channels_last) + + if args.distributed: + # For multiprocessing distributed, DistributedDataParallel constructor + # should always set the single device scope, otherwise, + # DistributedDataParallel will use all available devices. + if args.gpu is not None: + torch.cuda.set_device(args.gpu) + model.cuda(args.gpu) + # When using a single GPU per process and per + # DistributedDataParallel, we need to divide the batch size + # ourselves based on the total number of GPUs of the current node. 
+ args.batch_size = int(args.batch_size / ngpus_per_node) + args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node) + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) + elif args.xpu is not None: + torch.xpu.set_device(args.xpu) + model.xpu(args.xpu) + elif args.gpu is not None: + torch.cuda.set_device(args.gpu) + model = model.cuda(args.gpu) + print('model to cuda') + elif args.xpu is not None: + torch.xpu.set_device(args.xpu) + model = model.xpu(args.xpu) + print('model to xpu') + else: + # do training or inference on CPU + pass + + # define optimizer, and learning rate scheduler + optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, + momentum=args.momentum, weight_decay=args.weight_decay) + + # torch.xpu.optimize is only for device xpu and no jit script + if args.xpu is not None: + if args.evaluate: + if not args.int8: + print('doing torch xpu optimize for inference') + model.eval() + dtype = torch.float16 if args.fp16 else torch.float32 + dtype = torch.bfloat16 if args.bf16 else dtype + sample_batch_size = int(args.batch_size / 2) + # avoid batch size to be 0 after half divide + if sample_batch_size == 0: + sample_batch_size = 1 + sample_input = torch.randn((sample_batch_size, 3, 224, 224), device=args.xpu) + model = torch.xpu.optimize(model=model, dtype=dtype, level="O1", + sample_input=sample_input) + else: + model.train() + print('doing torch xpu optimize for training') + model, optimizer = torch.xpu.optimize(model=model, optimizer=optimizer, level="O1", + dtype=torch.bfloat16 if args.bf16 else torch.float32) + + if args.distributed: + if args.xpu is not None: + # When using a single GPU per process and per + # DistributedDataParallel, we need to divide the batch size + # ourselves based on the total number of GPUs we have + args.batch_size = int(args.batch_size / ngpus_per_node) + args.workers = int(args.workers / ngpus_per_node) + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.xpu], broadcast_buffers=args.broadcast_buffers, bucket_cap_mb=args.bucket_cap) + """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" + scheduler = StepLR(optimizer, step_size=30, gamma=0.1) + + # optionally resume from a checkpoint + if args.resume: + if os.path.isfile(args.resume): + print("=> loading checkpoint '{}'".format(args.resume)) + if args.gpu is None or args.xpu is None: + checkpoint = torch.load(args.resume) + elif args.gpu is not None: + # Map model to be loaded to specified single gpu. + loc = 'cuda:{}'.format(args.gpu) + checkpoint = torch.load(args.resume, map_location=loc) + elif args.xpu is not None: + # Map model to be loaded to specified single gpu. 
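+                # map_location below remaps the saved checkpoint tensors onto the selected
+                # XPU device (e.g. 'xpu:0') instead of the device they were saved from.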
+ loc = 'xpu:{}'.format(args.xpu) + checkpoint = torch.load(args.resume, map_location=loc) + args.start_epoch = checkpoint['epoch'] + best_acc1 = checkpoint['best_acc1'] + if args.gpu is not None: + # best_acc1 may be from a checkpoint from a different GPU + best_acc1 = best_acc1.to(args.gpu) + model.load_state_dict(checkpoint['state_dict']) + optimizer.load_state_dict(checkpoint['optimizer']) + scheduler.load_state_dict(checkpoint['scheduler']) + print("=> loaded checkpoint '{}' (epoch {})" + .format(args.resume, checkpoint['epoch'])) + else: + print("=> no checkpoint found at '{}'".format(args.resume)) + + if args.gpu is not None: + cudnn.benchmark = True + + # Data loading code + traindir = os.path.join(args.data, 'train') + valdir = os.path.join(args.data, 'val') + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + + # TODO: when use dummy dataset, the command shoud pass a dir, it needs revision in future + if args.dummy: + print("Dummy data is used!") + train_dataset = datasets.FakeData(1281167, (3, 224, 224), 1000, transforms.ToTensor()) + val_dataset_size = args.num_iterations * args.batch_size if (args.dummy and args.num_iterations) else 50000 + val_dataset = datasets.FakeData(val_dataset_size, (3, 224, 224), 1000, transforms.ToTensor()) + else: + traindir = os.path.join(args.data, 'train') + valdir = os.path.join(args.data, 'val') + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + train_dataset = datasets.ImageFolder( + traindir, + transforms.Compose([ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ])) + val_dataset = datasets.ImageFolder( + valdir, + transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ])) + + if args.distributed: + train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) + val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset, shuffle=False, drop_last=True) + else: + train_sampler = None + val_sampler = None + + # [watch out] The pin memory is default enabled on CUDA for now in torch. 
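+    # pin_memory_device="xpu" in the loaders below ties the pinned host staging buffers to
+    # the XPU backend (rather than the CUDA default), so each batch can be copied to the
+    # device asynchronously from page-locked memory.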
+ train_loader = torch.utils.data.DataLoader( + train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None), + num_workers=args.workers, pin_memory=True, pin_memory_device="xpu", sampler=train_sampler) + + val_loader = torch.utils.data.DataLoader( + val_dataset, batch_size=args.batch_size, shuffle=False, + num_workers=args.workers, pin_memory=True, pin_memory_device="xpu", sampler=val_sampler) + + # Profiling + profiling = os.environ.get("PROFILE", "OFF").upper() in ["1", "Y", "ON", "YES", "TRUE"] + + use_autocast = False + if args.bf16 or args.fp16: + print('using autocast') + use_autocast = True + + if args.evaluate: + if args.int8: + # calibration dataloader + val_loader_calib = torch.utils.data.DataLoader( + val_dataset, batch_size=args.calib_bs, shuffle=False, + num_workers=args.workers, pin_memory=True) + + # do calibration and return quant model + if args.load: + model_calib = model + else: + model_calib = jit_calib(model, val_loader_calib, args) + if args.save: + torch.jit.save(model_calib, args.save) + val_loader_inf = torch.utils.data.DataLoader( + val_dataset, batch_size=args.batch_size, shuffle=False, + num_workers=args.workers, pin_memory=True) + + print('doing int8 inference') + validate_quantization(val_loader_inf, model_calib, criterion, profiling, args) + else: + # epoch pass 0 + validate(val_loader, model, criterion, 0, profiling, use_autocast, args) + return + + for epoch in range(args.start_epoch, args.epochs): + if args.distributed: + train_sampler.set_epoch(epoch) + + args.lr = scheduler.get_last_lr()[0] + + # train for one epoch + train(train_loader, model, criterion, optimizer, epoch, profiling, use_autocast, args) + + # evaluate on validation set + acc1 = validate(val_loader, model, criterion, epoch, profiling, use_autocast, args) + + scheduler.step() + + # remember best acc@1 and save checkpoint + is_best = acc1 > best_acc1 + best_acc1 = max(acc1, best_acc1) + + if not args.multiprocessing_distributed or (args.multiprocessing_distributed + and args.rank % ngpus_per_node == 0): + save_checkpoint({ + 'epoch': epoch + 1, + 'arch': args.arch, + 'state_dict': model.state_dict(), + 'best_acc1': best_acc1, + 'optimizer' : optimizer.state_dict(), + 'scheduler' : scheduler.state_dict() + }, is_best) + + if args.tensorboard: + writer.close() + +def train(train_loader, model, criterion, optimizer, epoch, profiling, use_autocast, args): + batch_time = AverageMeter('Time', ':6.3f') + data_time = AverageMeter('Data', ':6.3f') + losses = AverageMeter('Loss', ':.4e') + top1 = AverageMeter('Acc@1', ':6.2f') + top5 = AverageMeter('Acc@5', ':6.2f') + progress = ProgressMeter( + len(train_loader), + [batch_time, data_time, losses, top1, top5], + prefix="Epoch: [{}]".format(epoch)) + + # switch to train mode + model.train() + + # record throughput + throughput = 0.0 + + data_start = time.time() + for i, (images, target) in enumerate(train_loader): + # measure data loading time + data_time.update(time.time() - data_start) + + if args.channels_last: + print('input to channels last') + images = images.to(memory_format=torch.channels_last) + + start_time = time.time() + + if args.xpu is not None: + # TODO: later the knieto will be used + with torch.autograd.profiler_legacy.profile(enabled=profiling, use_xpu=True, record_shapes=False) as prof: + images = images.to(args.xpu) + target = target.to(args.xpu) + + with torch.xpu.amp.autocast(enabled=use_autocast, dtype=torch.bfloat16): + # compute output + output = model(images) + loss = criterion(output, target) + + # compute 
gradient and do SGD step + optimizer.zero_grad(set_to_none=True) + loss.backward() + optimizer.step() + + # D2H + if args.xpu is not None: + loss = loss.cpu() + output = output.cpu() + target = target.cpu() + + # sync for time measurement on XPU + if args.xpu is not None: + torch.xpu.synchronize(args.xpu) + + if profiling: + profile_name = 'fp32' + if args.fp16: + profile_name = 'fp16' + elif args.bf16: + profile_name = 'bf16' + if args.distributed: + profile_name += '.xpu.' + str(args.rank) + torch.save(prof.key_averages().table(sort_by="self_xpu_time_total"), './profiling.' + profile_name + '.train.pt') + torch.save(prof.table(sort_by="id", row_limit=100000), './profiling.' + profile_name + '.train.detailed.pt') + else: + activities = None + prof_sort = None + if profiling: + prof_sort = "self_cpu_time_total" + activities=[torch.profiler.ProfilerActivity.CPU] + if args.gpu is not None: + activities.append(torch.profiler.ProfilerActivity.CUDA) + prof_sort = "self_cuda_time_total" + + with torch.profiler.profile(activities=activities, record_shapes=False) as prof: + if args.gpu is not None: + images = images.cuda(args.gpu, non_blocking=True) + target = target.cuda(args.gpu, non_blocking=True) + + # compute output + output = model(images) + loss = criterion(output, target) + + # compute gradient and do SGD step + optimizer.zero_grad() + loss.backward() + optimizer.step() + + # D2H, as sync + if args.gpu is not None: + loss = loss.cpu() + output = output.cpu() + target = target.cpu() + + if profiling: + torch.save(prof.key_averages().table(sort_by=prof_sort), './profiling.card.' + str(args.xpu) + '.pt') + + # measure elapsed time + duration_train = time.time() - start_time + batch_time.update(duration_train) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + if i % args.print_freq == 0: + progress.display(i + 1) + + # exclude first iteration for calculating througput + if i >= 3: + throughput += args.batch_size / duration_train + data_start = time.time() + + if i == (args.num_iterations - 1) and args.num_iterations >= 4: + print('Training performance: batch size:%d, throughput:%.2f image/sec' + % (args.batch_size, throughput / (args.num_iterations - 3))) + sys.exit(0) + elif args.num_iterations == 0 and i == len(train_loader) - 1: + print('Training performance: batch size:%d, throughput:%.2f image/sec' + % (args.batch_size, throughput / (len(train_loader) - 4))) + if args.tensorboard is None: + sys.exit(0) + + if args.tensorboard: + draw_tensorboard(epoch, losses.avg, top1.avg, top5.avg, 'train', args) + +def validate(val_loader, model, criterion, epoch, profiling, use_autocast, args): + + def run_validate(loader, model, autocast_dtype, base_progress=0): + + # record throughput + throughput = 0.0 + + with torch.no_grad(): + for i, (images, target) in enumerate(loader): + i = base_progress + i + + if args.channels_last: + images = images.to(memory_format=torch.channels_last) + print('images convert to channels last') + + start_time = time.time() + + if args.xpu: + with torch.autograd.profiler_legacy.profile(enabled=profiling, use_xpu=True, record_shapes=False) as prof: + images = images.to(args.xpu) + + if args.jit_trace: + # compute output + output = model(images) + else: + with torch.xpu.amp.autocast(enabled=use_autocast, dtype=autocast_dtype): + # compute output + output = model(images) + + # sync for time measurement + if 
args.xpu is not None: + torch.xpu.synchronize(args.xpu) + + if profiling: + profile_name = 'fp32' + if args.fp16: + profile_name = 'fp16' + elif args.bf16: + profile_name = 'bf16' + torch.save(prof.key_averages().table(sort_by="self_xpu_time_total"), './profiling.' + profile_name + '.inf.pt') + torch.save(prof.table(sort_by="id", row_limit=100000), './profiling.' + profile_name + '.inf.detailed.pt') + else: + activities = None + prof_sort = None + if profiling: + prof_sort = "self_cpu_time_total" + activities=[torch.profiler.ProfilerActivity.CPU] + if args.gpu is not None: + activities.append(torch.profiler.ProfilerActivity.CUDA) + prof_sort = "self_cuda_time_total" + + with torch.profiler.profile(activities=activities, record_shapes=False) as prof: + if args.gpu is not None: + images = images.cuda(args.gpu, non_blocking=True) + + # compute output + output = model(images) + + # sync for time measurement + if args.gpu is not None: + torch.cuda.synchronize(args.gpu) + + if profiling: + torch.save(prof.key_averages().table(sort_by=prof_sort), './profiling.pt') + + # D2H + output = output.cpu() + + # measure elapsed time + duration_eval = time.time() - start_time + batch_time.update(duration_eval) + + loss = criterion(output.float(), target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output.float(), target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + if i % args.print_freq == 0: + progress.display(i + 1) + + # exclude first iteration for calculating througput + if i >= 1: + throughput += args.batch_size / duration_eval + + if i == (args.num_iterations - 1) and args.num_iterations >= 2: + print('Evalution performance: batch size:%d, throughput:%.2f image/sec, Acc@1:%.2f, Acc@5:%.2f' + % (args.batch_size, throughput / (args.num_iterations - 1), top1.avg, top5.avg)) + sys.exit(0) + elif args.num_iterations == 0 and i == len(val_loader) - 1: + print('Evalution performance: batch size:%d, throughput:%.2f image/sec, Acc@1:%.2f, Acc@5:%.2f' + % (args.batch_size, throughput / (len(val_loader) - 2), top1.avg, top5.avg)) + if args.tensorboard is None: + sys.exit(0) + + batch_time = AverageMeter('Time', ':6.3f', Summary.NONE) + losses = AverageMeter('Loss', ':.4e', Summary.NONE) + top1 = AverageMeter('Acc@1', ':6.2f', Summary.AVERAGE) + top5 = AverageMeter('Acc@5', ':6.2f', Summary.AVERAGE) + progress = ProgressMeter( + len(val_loader) + (args.distributed and (len(val_loader.sampler) * args.world_size < len(val_loader.dataset))), + [batch_time, losses, top1, top5], + prefix='Test: ') + + # switch to evaluate mode + model.eval() + + autocast_dtype = torch.float32 + if args.fp16: + autocast_dtype = torch.float16 + elif args.bf16: + autocast_dtype = torch.bfloat16 + + if args.jit_trace and not args.load: + trace_input = torch.randn(args.batch_size, 3, 224, 224).to(args.xpu) + print('jit trace') + # TODO: sometimes got -997 issue, JIRA: https://jira.devtools.intel.com/browse/GSD-1869 + with torch.xpu.amp.autocast(enabled=use_autocast, dtype=autocast_dtype, cache_enabled=False): + model = torch.jit.trace(model, trace_input) + + if args.save: + if args.jit_trace: + torch.jit.save(model, args.save) + else: + torch.save(model, args.save) + run_validate(val_loader, model, autocast_dtype) + + progress.display_summary() + + if args.tensorboard: + draw_tensorboard(epoch, None, top1.avg, top5.avg, 'val', args) + + return top1.avg + +def validate_quantization(val_loader, model, criterion, profiling, 
args): + batch_time = AverageMeter('Time', ':6.3f', Summary.NONE) + losses = AverageMeter('Loss', ':.4e', Summary.NONE) + top1 = AverageMeter('Acc@1', ':6.2f', Summary.AVERAGE) + top5 = AverageMeter('Acc@5', ':6.2f', Summary.AVERAGE) + progress = ProgressMeter( + len(val_loader) + (args.distributed and (len(val_loader.sampler) * args.world_size < len(val_loader.dataset))), + [batch_time, losses, top1, top5], + prefix='Test: ') + + # switch to evaluate mode + model.eval() + + # record throughput + throughput = 0.0 + + with torch.inference_mode(): + for i, (images, target) in enumerate(val_loader): + if args.xpu is not None and args.benchmark == 1: + images = images.to(args.xpu) + + start = time.time() + with torch.autograd.profiler_legacy.profile(enabled=profiling, use_xpu=True, record_shapes=False) as prof: + + if args.xpu is not None and args.benchmark == 0: + images = images.to(args.xpu) + + # compute output + output = model(images) + + # D2H + output = output.to("cpu") + + # sync for time measurement + torch.xpu.synchronize(args.xpu) + + # measure elapsed time + end = time.time() + batch_time.update(end - start) + duration_eval = end - start + + if profiling: + torch.save(prof.key_averages().table(sort_by="self_xpu_time_total"), './profiling.int8.inf.pt') + torch.save(prof.table(sort_by="id", row_limit=100000), './profiling.detailed.int8.inf.pt') + + loss = criterion(output, target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + if i % args.print_freq == 0: + progress.display(i + 1) + + # exclude first iteration for calculating througput + perf_start_iter = 1 + if args.benchmark == 1 and args.num_iterations >= 500: + perf_start_iter = math.floor(args.num_iterations * 0.7) + if i >= perf_start_iter: + throughput += args.batch_size / duration_eval + + if i == (args.num_iterations - 1) and args.num_iterations >= 2: + print('Quantization Evalution performance: batch size:%d, throughput:%.2f image/sec, Acc@1:%.2f, Acc@5:%.2f' + % (args.batch_size, throughput / (args.num_iterations - perf_start_iter), top1.avg, top5.avg)) + sys.exit(0) + elif args.num_iterations == 0 and i == len(val_loader) - 1: + print('Quantization Evalution performance: batch size:%d, throughput:%.2f image/sec, Acc@1:%.2f, Acc@5:%.2f' + % (args.batch_size, throughput / (len(val_loader) - 2), top1.avg, top5.avg)) + sys.exit(0) + + progress.display_summary() + + return top1.avg + +def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'): + torch.save(state, filename) + if is_best: + shutil.copyfile(filename, 'model_best.pth.tar') + +def qnormalize(tensor, mean, std, scl): + dtype = tensor.dtype + mean = torch.as_tensor(mean, dtype=dtype, device=tensor.device) + std = torch.as_tensor(std, dtype=dtype, device=tensor.device) + if mean.ndim == 1: + mean = mean[:, None, None] + if std.ndim == 1: + std = std[:, None, None] + + tensor.sub_(mean).div_(std)#.mul_(scale)#tensor.sub_(255 * mean).mul(128/255*(1/(1-0.406))) + out = torch.quantize_per_tensor(tensor, scale=scl, zero_point=0, dtype=torch.qint8) + return out + +def compute_scale(val_loader_com): + for i, (input, target) in enumerate(val_loader_com): + scale = 1.0 / (128 / torch.max(input)) + return scale + +class QNormalize(object): + def __init__(self, mean, std, scale): + self.mean = mean + self.std = std + self.scale = scale + + def __call__(self, tensor): + return qnormalize(tensor, 
self.mean, self.std, self.scale) + +class Summary(Enum): + NONE = 0 + AVERAGE = 1 + SUM = 2 + COUNT = 3 + +class AverageMeter(object): + """Computes and stores the average and current value""" + def __init__(self, name, fmt=':f', summary_type=Summary.AVERAGE): + self.name = name + self.fmt = fmt + self.summary_type = summary_type + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + def all_reduce(self): + total = torch.FloatTensor([self.sum, self.count]) + dist.all_reduce(total, dist.ReduceOp.SUM, async_op=False) + self.sum, self.count = total.tolist() + self.avg = self.sum / self.count + + def __str__(self): + fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' + return fmtstr.format(**self.__dict__) + + def summary(self): + fmtstr = '' + if self.summary_type is Summary.NONE: + fmtstr = '' + elif self.summary_type is Summary.AVERAGE: + fmtstr = '{name} {avg:.3f}' + elif self.summary_type is Summary.SUM: + fmtstr = '{name} {sum:.3f}' + elif self.summary_type is Summary.COUNT: + fmtstr = '{name} {count:.3f}' + else: + raise ValueError('invalid summary type %r' % self.summary_type) + + return fmtstr.format(**self.__dict__) + +class ProgressMeter(object): + def __init__(self, num_batches, meters, prefix=""): + self.batch_fmtstr = self._get_batch_fmtstr(num_batches) + self.meters = meters + self.prefix = prefix + + def display(self, batch): + entries = [self.prefix + self.batch_fmtstr.format(batch)] + entries += [str(meter) for meter in self.meters] + print('\t'.join(entries)) + + def display_summary(self): + entries = [" *"] + entries += [meter.summary() for meter in self.meters] + print(' '.join(entries)) + + def _get_batch_fmtstr(self, num_batches): + num_digits = len(str(num_batches // 1)) + fmt = '{:' + str(num_digits) + 'd}' + return '[' + fmt + '/' + fmt.format(num_batches) + ']' + +def accuracy(output, target, topk=(1,)): + """Computes the accuracy over the k top predictions for the specified values of k""" + with torch.no_grad(): + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + pred = pred.cpu() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + +def draw_tensorboard(num_epoch, avg_loss, avg_acc1, avg_acc5, mode, args): + if mode == 'train': + writer.add_scalar('training: learning rate', args.lr, num_epoch) + writer.add_scalar('training: loss', avg_loss, num_epoch) + writer.add_scalar('training: top1 acc', avg_acc1, num_epoch) + writer.add_scalar('training: top5 acc', avg_acc5, num_epoch) + else: + writer.add_scalar('val: top1 acc', avg_acc1, num_epoch) + writer.add_scalar('val: top5 acc', avg_acc5, num_epoch) + +if __name__ == '__main__': + main() diff --git a/models/image_recognition/pytorch/resnet50v1_5/training/gpu/__init__.py b/models/image_recognition/pytorch/resnet50v1_5/training/gpu/__init__.py new file mode 100644 index 000000000..6f72f91ea --- /dev/null +++ b/models/image_recognition/pytorch/resnet50v1_5/training/gpu/__init__.py @@ -0,0 +1,19 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2021 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# diff --git a/models/image_recognition/pytorch/resnet50v1_5/training/gpu/main.py b/models/image_recognition/pytorch/resnet50v1_5/training/gpu/main.py new file mode 100644 index 000000000..af229fd26 --- /dev/null +++ b/models/image_recognition/pytorch/resnet50v1_5/training/gpu/main.py @@ -0,0 +1,1059 @@ +# +# **************************************************************************** +# Copyright 2019-2022 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# **************************************************************************** + +# **************************************************************************** +# BSD 3-Clause License +# # This model is originally from the PyTorch Examples repo +# (https://github.com/pytorch/examples/blob/master/imagenet/main.py) +# **************************************************************************** + +# **************************************************************************** +# Copyright (c) 2017, +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are met: +# +# * Redistributions of source code must retain the above copyright notice, this +# list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright notice, +# this list of conditions and the following disclaimer in the documentation +# and/or other materials provided with the distribution. +# +# * Neither the name of the copyright holder nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +# OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+# **************************************************************************** + +import argparse +import os +import sys +import random +import shutil +import time +import warnings +from enum import Enum + +import torch +import torch.nn as nn +import torch.nn.parallel +import torch.backends.cudnn as cudnn +import torch.distributed as dist +import torch.optim +from torch.optim.lr_scheduler import StepLR +import torch.multiprocessing as mp +import torch.utils.data +import torch.utils.data.distributed +import torchvision.transforms as transforms +import torchvision.datasets as datasets +import torchvision.models as models +from torch.utils.data import Subset +import math + +torch._C._jit_set_profiling_mode(False) +torch._C._jit_set_profiling_executor(False) + +model_names = sorted(name for name in models.__dict__ + if name.islower() and not name.startswith("__") + and callable(models.__dict__[name])) + +cwd = os.path.dirname(os.path.abspath(__file__)) +hub = os.path.expanduser("~/.cache/torch/intel") +if not os.path.exists(hub): + os.makedirs(hub) + +parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') +parser.add_argument('data', metavar='DIR', nargs='?', default='imagenet', + help='path to dataset (default: imagenet)') +parser.add_argument('-a', '--arch', metavar='ARCH', default='resnet18', + choices=model_names, + help='model architecture: ' + + ' | '.join(model_names) + + ' (default: resnet18)') +parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', + help='number of data loading workers (default: 4)') +parser.add_argument('--epochs', default=90, type=int, metavar='N', + help='number of total epochs to run') +parser.add_argument('--start-epoch', default=0, type=int, metavar='N', + help='manual epoch number (useful on restarts)') +parser.add_argument('-b', '--batch-size', default=256, type=int, + metavar='N', + help='mini-batch size (default: 256), this is the total ' + 'batch size of all GPUs on the current node when ' + 'using Data Parallel or Distributed Data Parallel') +parser.add_argument('--lr', '--learning-rate', default=0.1, type=float, + metavar='LR', help='initial learning rate', dest='lr') +parser.add_argument('--momentum', default=0.9, type=float, metavar='M', + help='momentum') +parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float, + metavar='W', help='weight decay (default: 1e-4)', + dest='weight_decay') +parser.add_argument('-p', '--print-freq', default=1, type=int, + metavar='N', help='print frequency (default: 10)') +parser.add_argument('--resume', default='', type=str, metavar='PATH', + help='path to latest checkpoint (default: none)') +parser.add_argument('-e', '--evaluate', dest='evaluate', action='store_true', + help='evaluate model on validation set') +parser.add_argument('--pretrained', dest='pretrained', action='store_true', + help='use pre-trained model') +parser.add_argument('--world-size', default=-1, type=int, + help='number of nodes for distributed training') +parser.add_argument('--rank', default=-1, type=int, + help='node rank for distributed training') +parser.add_argument('--dist-url', default='127.0.0.1', type=str, + help='url used to set up distributed training') +parser.add_argument('--dist-port', default='29500', type=str, + help='url port used to set up distributed training') +parser.add_argument('--dist-backend', default='ccl', type=str, + help='distributed backend, default is torch-ccl') +parser.add_argument('--seed', default=None, type=int, + help='seed for initializing training. 
') +parser.add_argument('--gpu', default=None, type=int, + help='GPU id to use.') +parser.add_argument('--xpu', default=None, type=int, + help='XPU id to use.') +parser.add_argument('--tf32', default=0, type=int, help='Datatype used: TF32') +parser.add_argument('--bf32', default=0, type=int, help='Datatype used: BF32') +parser.add_argument('--fp16', default=0, type=int, help='Datatype used: FP16') +parser.add_argument('--bf16', default=0, type=int, help='Datatype used: BF16') +parser.add_argument('--int8', default=0, type=int, help='Use int8 quantization to do inference') +parser.add_argument('--broadcast-buffers', default=True, type=bool, help='enables syncing buffers') +parser.add_argument('--bucket-cap', default=25, type=int, help='controls the bucket size in MegaBytes') +parser.add_argument('--jit-cache', type=str, default=str(hub), help="path to save/load jit model") +parser.add_argument('--jit-trace', action='store_true', + help='enable PyTorch jit trace graph mode') +parser.add_argument('--calib-iters', default=8, type=int, + help='iteration number for calibration') +parser.add_argument('--calib-bs', default=32, type=int, + metavar='N', help='mini-batch size for calibration') +parser.add_argument('--perchannel-weight', default=False, + help='do calibration with weight per channel quantization') +parser.add_argument('--channels-last', action='store_true', help='enable channels last') +parser.add_argument('--num-iterations', default=0, type=int) +parser.add_argument('--tensorboard', default=None, action='store_true', + help='Use Tensorboard to visualize the training metrics') +parser.add_argument("--dummy", action="store_true", help='use dummy data for ' + 'benchmark training or val') +parser.add_argument('--multiprocessing-distributed', action='store_true', + help='Use multi-processing distributed training to launch ' + 'N processes per node, which has N GPUs. 
This is the ' + 'fastest way to use PyTorch for either single node or ' + 'multi node data parallel training') +parser.add_argument('--benchmark', default=0, type=int, help='for int8 benchmarking ' + 'performance, move H2D transfers out of the end-to-end time') +parser.add_argument("--save", help='Path to save the entire model; saved for inference only, training is not available') +parser.add_argument("--load", help='Path to load the entire inference model') + +best_acc1 = 0 + +def main(): + args = parser.parse_args() + + if args.xpu is not None and args.gpu is not None: + print('Choose either an NVIDIA GPU (--gpu) or an XPU (--xpu), not both.') + sys.exit() + + if args.gpu is not None and not torch.cuda.is_available(): + print('CUDA is not available; make sure torch is built with CUDA enabled.') + sys.exit() + + if args.xpu is not None: + import intel_extension_for_pytorch as ipex + + # only for training + if not args.evaluate: + if args.tf32: + print('doing TF32 training') + torch.xpu.set_fp32_math_mode(torch.xpu.FP32MathMode.TF32) + elif args.bf32: + args.bf16 = 1 + print('doing BF32 training') + torch.xpu.set_fp32_math_mode(torch.xpu.FP32MathMode.BF32) + else: + torch.xpu.set_fp32_math_mode(torch.xpu.FP32MathMode.FP32) + + if args.dist_backend == 'ccl': + try: + import oneccl_bindings_for_pytorch + except ImportError: + print("oneccl_bindings_for_pytorch not available!") + + if args.int8 and (not args.evaluate or args.xpu is None): + print('int8 quantization is only supported for XPU inference; ' + 'pass -e and --xpu [dev_id] on the command line') + sys.exit() + + if args.int8 and args.channels_last: + print('channels last is not yet supported for int8 quantization') + sys.exit() + + if args.tensorboard is not None: + from torch.utils.tensorboard import SummaryWriter + global writer + writer = SummaryWriter(log_dir='./tensorboard_log') + if args.num_iterations is not None: + warnings.warn('TensorBoard metrics are recorded per epoch.') + + if args.seed is not None: + random.seed(args.seed) + torch.manual_seed(args.seed) + cudnn.deterministic = True + warnings.warn('You have chosen to seed training. ' + 'This will turn on the CUDNN deterministic setting, ' + 'which can slow down your training considerably! ' + 'You may see unexpected behavior when restarting ' + 'from checkpoints.') + + if args.gpu is not None: + warnings.warn('You have chosen a specific GPU. 
This will completely ' + 'disable data parallelism.') + + if args.world_size == -1: + mpi_world_size = int(os.environ.get('PMI_SIZE', -1)) + + if mpi_world_size > 0: + os.environ['MASTER_ADDR'] = args.dist_url #'127.0.0.1' + os.environ['MASTER_PORT'] = args.dist_port #'29500' + os.environ['RANK'] = os.environ.get('PMI_RANK', -1) + os.environ['WORLD_SIZE'] = os.environ.get('PMI_SIZE', -1) + args.rank = int(os.environ.get('PMI_RANK', -1)) + args.world_size = int(os.environ.get("WORLD_SIZE", -1)) + + args.distributed = args.world_size > 1 or args.multiprocessing_distributed + + # 1 XPU card has 2 tile, and both are regarded as isolated devices/nodes + ngpus_per_node = 1 + if args.multiprocessing_distributed: + # Since we have ngpus_per_node processes per node, the total world_size + # needs to be adjusted accordingly + args.world_size = ngpus_per_node * args.world_size + # Use torch.multiprocessing.spawn to launch distributed processes: the + # main_worker process function + mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, args)) + else: + # Simply call main_worker function + main_worker(ngpus_per_node, args) + +def jit_calib(model, val_loader_calib, args): + print('doing int8 jit calibration') + jit_model_file = os.path.join(args.jit_cache, "rn50_jit_model_int8.pt") + if os.path.isfile(jit_model_file): + print("=> load jit model from {}".format(jit_model_file)) + modelJit = torch.load(jit_model_file) + print("=> load jit model ... done") + else: + from torch.jit._recursive import wrap_cpp_module + from torch.quantization.quantize_jit import ( + convert_jit, + prepare_jit, + ) + modelJit = torch.jit.script(model) + modelJit = wrap_cpp_module(torch._C._jit_pass_fold_convbn(modelJit._c)) + + with torch.inference_mode(): + if args.perchannel_weight: + qconfig = torch.quantization.QConfig( + activation=torch.quantization.observer.MinMaxObserver.with_args( + qscheme=torch.per_tensor_symmetric, + reduce_range=False, + dtype=torch.quint8 + ), + weight=torch.quantization.default_per_channel_weight_observer + ) + else: + qconfig = torch.quantization.QConfig( + activation=torch.quantization.observer.MinMaxObserver.with_args( + qscheme=torch.per_tensor_symmetric, + reduce_range=False, + dtype=torch.quint8 + ), + weight=torch.quantization.default_weight_observer + ) + modelJit = prepare_jit(modelJit, {'': qconfig}, True) + + for i, (input, target) in enumerate(val_loader_calib): + calib = input.to(args.xpu) + modelJit(calib) + + if i == args.calib_iters - 1: + break + modelJit = convert_jit(modelJit, True) + + return modelJit + +def main_worker(ngpus_per_node, args): + global best_acc1 + + if args.distributed: + if args.rank == -1: + args.rank = int(os.environ["RANK"]) + if args.multiprocessing_distributed: + # For multiprocessing distributed training, rank needs to be the + # global rank among all the processes + args.rank = args.rank * ngpus_per_node + args.gpu + init_method = 'tcp://' + args.dist_url + ':' + args.dist_port + dist.init_process_group(backend=args.dist_backend, init_method=init_method, + world_size=args.world_size, rank=args.rank) + + if args.gpu is not None: + args.gpu = args.rank + elif args.xpu is not None: + local_rank = os.environ['MPI_LOCALRANKID'] + if 'OMPI_COMM_WORLD_LOCAL_RANK' in os.environ.keys(): + local_rank = os.environ['OMPI_COMM_WORLD_LOCAL_RANK'] + args.xpu = local_rank + print('world_size:{}, rank:{}, local_rank:{}'.format(args.world_size, args.rank, local_rank)) + + if args.gpu is not None: + print("Use GPU: {}".format(args.gpu)) + args.gpu = 
"cuda:{}".format(args.gpu) + elif args.xpu is not None: + print("Use XPU: {}".format(args.xpu)) + args.xpu = "xpu:{}".format(args.xpu) + else: + print("Use CPU") + + # define loss function (criterion) + criterion = nn.CrossEntropyLoss() + if args.gpu is not None: + criterion = nn.CrossEntropyLoss().cuda(args.gpu) + elif args.xpu is not None: + criterion = nn.CrossEntropyLoss().xpu(args.xpu) + # create model + if args.load: + if os.path.isfile(args.load): + load_path = args.load + if args.jit_trace: + model = torch.jit.load(load_path) + elif args.evaluate and args.int8: + model = torch.jit.load(load_path) + else: + model = torch.load(load_path) + optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, + momentum=args.momentum, weight_decay=args.weight_decay) + scheduler = StepLR(optimizer, step_size=30, gamma=0.1) + else: + print("=> no saved model found at '{}'".format(args.load)) + sys.exit(1) + else: + if args.pretrained: + print("=> using pre-trained model '{}'".format(args.arch)) + model = models.__dict__[args.arch](pretrained=True) + else: + print("=> creating model '{}'".format(args.arch)) + model = models.__dict__[args.arch]() + + # channels last + # TODO: this will be default memory format in future + if args.channels_last: + print('model is converted to channels last') + model = model.to(memory_format=torch.channels_last) + + if args.distributed: + # For multiprocessing distributed, DistributedDataParallel constructor + # should always set the single device scope, otherwise, + # DistributedDataParallel will use all available devices. + if args.gpu is not None: + torch.cuda.set_device(args.gpu) + model.cuda(args.gpu) + # When using a single GPU per process and per + # DistributedDataParallel, we need to divide the batch size + # ourselves based on the total number of GPUs of the current node. 
+ args.batch_size = int(args.batch_size / ngpus_per_node) + args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node) + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) + elif args.xpu is not None: + torch.xpu.set_device(args.xpu) + model.xpu(args.xpu) + elif args.gpu is not None: + torch.cuda.set_device(args.gpu) + model = model.cuda(args.gpu) + print('model to cuda') + elif args.xpu is not None: + torch.xpu.set_device(args.xpu) + model = model.xpu(args.xpu) + print('model to xpu') + else: + # do training or inference on CPU + pass + + # define optimizer, and learning rate scheduler + optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, + momentum=args.momentum, weight_decay=args.weight_decay) + + # torch.xpu.optimize is only for device xpu and no jit script + if args.xpu is not None: + if args.evaluate: + if not args.int8: + print('doing torch xpu optimize for inference') + model.eval() + dtype = torch.float16 if args.fp16 else torch.float32 + dtype = torch.bfloat16 if args.bf16 else dtype + sample_batch_size = int(args.batch_size / 2) + # avoid batch size to be 0 after half divide + if sample_batch_size == 0: + sample_batch_size = 1 + sample_input = torch.randn((sample_batch_size, 3, 224, 224), device=args.xpu) + model = torch.xpu.optimize(model=model, dtype=dtype, level="O1", + sample_input=sample_input) + else: + model.train() + print('doing torch xpu optimize for training') + model, optimizer = torch.xpu.optimize(model=model, optimizer=optimizer, level="O1", + dtype=torch.bfloat16 if args.bf16 else torch.float32) + + if args.distributed: + if args.xpu is not None: + # When using a single GPU per process and per + # DistributedDataParallel, we need to divide the batch size + # ourselves based on the total number of GPUs we have + args.batch_size = int(args.batch_size / ngpus_per_node) + args.workers = int(args.workers / ngpus_per_node) + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.xpu], broadcast_buffers=args.broadcast_buffers, bucket_cap_mb=args.bucket_cap) + """Sets the learning rate to the initial LR decayed by 10 every 30 epochs""" + scheduler = StepLR(optimizer, step_size=30, gamma=0.1) + + # optionally resume from a checkpoint + if args.resume: + if os.path.isfile(args.resume): + print("=> loading checkpoint '{}'".format(args.resume)) + if args.gpu is None or args.xpu is None: + checkpoint = torch.load(args.resume) + elif args.gpu is not None: + # Map model to be loaded to specified single gpu. + loc = 'cuda:{}'.format(args.gpu) + checkpoint = torch.load(args.resume, map_location=loc) + elif args.xpu is not None: + # Map model to be loaded to specified single gpu. 
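+ # For example, a checkpoint saved from 'xpu:0' (or 'cuda:0') is remapped
+ # straight onto this rank's tile via torch.load(..., map_location=loc),
+ # rather than being restored on the device it was originally saved from.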
+ loc = 'xpu:{}'.format(args.xpu) + checkpoint = torch.load(args.resume, map_location=loc) + args.start_epoch = checkpoint['epoch'] + best_acc1 = checkpoint['best_acc1'] + if args.gpu is not None: + # best_acc1 may be from a checkpoint from a different GPU + best_acc1 = best_acc1.to(args.gpu) + model.load_state_dict(checkpoint['state_dict']) + optimizer.load_state_dict(checkpoint['optimizer']) + scheduler.load_state_dict(checkpoint['scheduler']) + print("=> loaded checkpoint '{}' (epoch {})" + .format(args.resume, checkpoint['epoch'])) + else: + print("=> no checkpoint found at '{}'".format(args.resume)) + + if args.gpu is not None: + cudnn.benchmark = True + + # Data loading code + traindir = os.path.join(args.data, 'train') + valdir = os.path.join(args.data, 'val') + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + + # TODO: when use dummy dataset, the command shoud pass a dir, it needs revision in future + if args.dummy: + print("Dummy data is used!") + train_dataset = datasets.FakeData(1281167, (3, 224, 224), 1000, transforms.ToTensor()) + val_dataset_size = args.num_iterations * args.batch_size if (args.dummy and args.num_iterations) else 50000 + val_dataset = datasets.FakeData(val_dataset_size, (3, 224, 224), 1000, transforms.ToTensor()) + else: + traindir = os.path.join(args.data, 'train') + valdir = os.path.join(args.data, 'val') + normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]) + train_dataset = datasets.ImageFolder( + traindir, + transforms.Compose([ + transforms.RandomResizedCrop(224), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + normalize, + ])) + val_dataset = datasets.ImageFolder( + valdir, + transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + normalize, + ])) + + if args.distributed: + train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) + val_sampler = torch.utils.data.distributed.DistributedSampler(val_dataset, shuffle=False, drop_last=True) + else: + train_sampler = None + val_sampler = None + + # [watch out] The pin memory is default enabled on CUDA for now in torch. 
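+ # pin_memory_device="xpu" below tells the DataLoader to pin host staging
+ # buffers for the XPU backend instead of the default CUDA backend, so
+ # host-to-device copies of images and targets come from pinned memory.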
+ train_loader = torch.utils.data.DataLoader( + train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None), + num_workers=args.workers, pin_memory=True, pin_memory_device="xpu", sampler=train_sampler) + + val_loader = torch.utils.data.DataLoader( + val_dataset, batch_size=args.batch_size, shuffle=False, + num_workers=args.workers, pin_memory=True, pin_memory_device="xpu", sampler=val_sampler) + + # Profiling + profiling = os.environ.get("PROFILE", "OFF").upper() in ["1", "Y", "ON", "YES", "TRUE"] + + use_autocast = False + if args.bf16 or args.fp16: + print('using autocast') + use_autocast = True + + if args.evaluate: + if args.int8: + # calibration dataloader + val_loader_calib = torch.utils.data.DataLoader( + val_dataset, batch_size=args.calib_bs, shuffle=False, + num_workers=args.workers, pin_memory=True) + + # do calibration and return quant model + if args.load: + model_calib = model + else: + model_calib = jit_calib(model, val_loader_calib, args) + if args.save: + torch.jit.save(model_calib, args.save) + val_loader_inf = torch.utils.data.DataLoader( + val_dataset, batch_size=args.batch_size, shuffle=False, + num_workers=args.workers, pin_memory=True) + + print('doing int8 inference') + validate_quantization(val_loader_inf, model_calib, criterion, profiling, args) + else: + # epoch pass 0 + validate(val_loader, model, criterion, 0, profiling, use_autocast, args) + return + + for epoch in range(args.start_epoch, args.epochs): + if args.distributed: + train_sampler.set_epoch(epoch) + + args.lr = scheduler.get_last_lr()[0] + + # train for one epoch + train(train_loader, model, criterion, optimizer, epoch, profiling, use_autocast, args) + + # evaluate on validation set + acc1 = validate(val_loader, model, criterion, epoch, profiling, use_autocast, args) + + scheduler.step() + + # remember best acc@1 and save checkpoint + is_best = acc1 > best_acc1 + best_acc1 = max(acc1, best_acc1) + + if not args.multiprocessing_distributed or (args.multiprocessing_distributed + and args.rank % ngpus_per_node == 0): + save_checkpoint({ + 'epoch': epoch + 1, + 'arch': args.arch, + 'state_dict': model.state_dict(), + 'best_acc1': best_acc1, + 'optimizer' : optimizer.state_dict(), + 'scheduler' : scheduler.state_dict() + }, is_best) + + if args.tensorboard: + writer.close() + +def train(train_loader, model, criterion, optimizer, epoch, profiling, use_autocast, args): + batch_time = AverageMeter('Time', ':6.3f') + data_time = AverageMeter('Data', ':6.3f') + losses = AverageMeter('Loss', ':.4e') + top1 = AverageMeter('Acc@1', ':6.2f') + top5 = AverageMeter('Acc@5', ':6.2f') + progress = ProgressMeter( + len(train_loader), + [batch_time, data_time, losses, top1, top5], + prefix="Epoch: [{}]".format(epoch)) + + # switch to train mode + model.train() + + # record throughput + throughput = 0.0 + + data_start = time.time() + for i, (images, target) in enumerate(train_loader): + # measure data loading time + data_time.update(time.time() - data_start) + + if args.channels_last: + print('input to channels last') + images = images.to(memory_format=torch.channels_last) + + start_time = time.time() + + if args.xpu is not None: + # TODO: later the knieto will be used + with torch.autograd.profiler_legacy.profile(enabled=profiling, use_xpu=True, record_shapes=False) as prof: + images = images.to(args.xpu) + target = target.to(args.xpu) + + with torch.xpu.amp.autocast(enabled=use_autocast, dtype=torch.bfloat16): + # compute output + output = model(images) + loss = criterion(output, target) + + # compute 
gradient and do SGD step + optimizer.zero_grad(set_to_none=True) + loss.backward() + optimizer.step() + + # D2H + if args.xpu is not None: + loss = loss.cpu() + output = output.cpu() + target = target.cpu() + + # sync for time measurement on XPU + if args.xpu is not None: + torch.xpu.synchronize(args.xpu) + + if profiling: + profile_name = 'fp32' + if args.fp16: + profile_name = 'fp16' + elif args.bf16: + profile_name = 'bf16' + if args.distributed: + profile_name += '.xpu.' + str(args.rank) + torch.save(prof.key_averages().table(sort_by="self_xpu_time_total"), './profiling.' + profile_name + '.train.pt') + torch.save(prof.table(sort_by="id", row_limit=100000), './profiling.' + profile_name + '.train.detailed.pt') + else: + activities = None + prof_sort = None + if profiling: + prof_sort = "self_cpu_time_total" + activities=[torch.profiler.ProfilerActivity.CPU] + if args.gpu is not None: + activities.append(torch.profiler.ProfilerActivity.CUDA) + prof_sort = "self_cuda_time_total" + + with torch.profiler.profile(activities=activities, record_shapes=False) as prof: + if args.gpu is not None: + images = images.cuda(args.gpu, non_blocking=True) + target = target.cuda(args.gpu, non_blocking=True) + + # compute output + output = model(images) + loss = criterion(output, target) + + # compute gradient and do SGD step + optimizer.zero_grad() + loss.backward() + optimizer.step() + + # D2H, as sync + if args.gpu is not None: + loss = loss.cpu() + output = output.cpu() + target = target.cpu() + + if profiling: + torch.save(prof.key_averages().table(sort_by=prof_sort), './profiling.card.' + str(args.xpu) + '.pt') + + # measure elapsed time + duration_train = time.time() - start_time + batch_time.update(duration_train) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + if i % args.print_freq == 0: + progress.display(i + 1) + + # exclude first iteration for calculating througput + if i >= 3: + throughput += args.batch_size / duration_train + data_start = time.time() + + if i == (args.num_iterations - 1) and args.num_iterations >= 4: + print('Training performance: batch size:%d, throughput:%.2f image/sec' + % (args.batch_size, throughput / (args.num_iterations - 3))) + sys.exit(0) + elif args.num_iterations == 0 and i == len(train_loader) - 1: + print('Training performance: batch size:%d, throughput:%.2f image/sec' + % (args.batch_size, throughput / (len(train_loader) - 4))) + if args.tensorboard is None: + sys.exit(0) + + if args.tensorboard: + draw_tensorboard(epoch, losses.avg, top1.avg, top5.avg, 'train', args) + +def validate(val_loader, model, criterion, epoch, profiling, use_autocast, args): + + def run_validate(loader, model, autocast_dtype, base_progress=0): + + # record throughput + throughput = 0.0 + + with torch.no_grad(): + for i, (images, target) in enumerate(loader): + i = base_progress + i + + if args.channels_last: + images = images.to(memory_format=torch.channels_last) + print('images convert to channels last') + + start_time = time.time() + + if args.xpu: + with torch.autograd.profiler_legacy.profile(enabled=profiling, use_xpu=True, record_shapes=False) as prof: + images = images.to(args.xpu) + + if args.jit_trace: + # compute output + output = model(images) + else: + with torch.xpu.amp.autocast(enabled=use_autocast, dtype=autocast_dtype): + # compute output + output = model(images) + + # sync for time measurement + if 
args.xpu is not None: + torch.xpu.synchronize(args.xpu) + + if profiling: + profile_name = 'fp32' + if args.fp16: + profile_name = 'fp16' + elif args.bf16: + profile_name = 'bf16' + torch.save(prof.key_averages().table(sort_by="self_xpu_time_total"), './profiling.' + profile_name + '.inf.pt') + torch.save(prof.table(sort_by="id", row_limit=100000), './profiling.' + profile_name + '.inf.detailed.pt') + else: + activities = None + prof_sort = None + if profiling: + prof_sort = "self_cpu_time_total" + activities=[torch.profiler.ProfilerActivity.CPU] + if args.gpu is not None: + activities.append(torch.profiler.ProfilerActivity.CUDA) + prof_sort = "self_cuda_time_total" + + with torch.profiler.profile(activities=activities, record_shapes=False) as prof: + if args.gpu is not None: + images = images.cuda(args.gpu, non_blocking=True) + + # compute output + output = model(images) + + # sync for time measurement + if args.gpu is not None: + torch.cuda.synchronize(args.gpu) + + if profiling: + torch.save(prof.key_averages().table(sort_by=prof_sort), './profiling.pt') + + # D2H + output = output.cpu() + + # measure elapsed time + duration_eval = time.time() - start_time + batch_time.update(duration_eval) + + loss = criterion(output.float(), target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output.float(), target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + if i % args.print_freq == 0: + progress.display(i + 1) + + # exclude first iteration for calculating througput + if i >= 1: + throughput += args.batch_size / duration_eval + + if i == (args.num_iterations - 1) and args.num_iterations >= 2: + print('Evalution performance: batch size:%d, throughput:%.2f image/sec, Acc@1:%.2f, Acc@5:%.2f' + % (args.batch_size, throughput / (args.num_iterations - 1), top1.avg, top5.avg)) + sys.exit(0) + elif args.num_iterations == 0 and i == len(val_loader) - 1: + print('Evalution performance: batch size:%d, throughput:%.2f image/sec, Acc@1:%.2f, Acc@5:%.2f' + % (args.batch_size, throughput / (len(val_loader) - 2), top1.avg, top5.avg)) + if args.tensorboard is None: + sys.exit(0) + + batch_time = AverageMeter('Time', ':6.3f', Summary.NONE) + losses = AverageMeter('Loss', ':.4e', Summary.NONE) + top1 = AverageMeter('Acc@1', ':6.2f', Summary.AVERAGE) + top5 = AverageMeter('Acc@5', ':6.2f', Summary.AVERAGE) + progress = ProgressMeter( + len(val_loader) + (args.distributed and (len(val_loader.sampler) * args.world_size < len(val_loader.dataset))), + [batch_time, losses, top1, top5], + prefix='Test: ') + + # switch to evaluate mode + model.eval() + + autocast_dtype = torch.float32 + if args.fp16: + autocast_dtype = torch.float16 + elif args.bf16: + autocast_dtype = torch.bfloat16 + + if args.jit_trace and not args.load: + trace_input = torch.randn(args.batch_size, 3, 224, 224).to(args.xpu) + print('jit trace') + # TODO: sometimes got -997 issue, JIRA: https://jira.devtools.intel.com/browse/GSD-1869 + with torch.xpu.amp.autocast(enabled=use_autocast, dtype=autocast_dtype, cache_enabled=False): + model = torch.jit.trace(model, trace_input) + + if args.save: + if args.jit_trace: + torch.jit.save(model, args.save) + else: + torch.save(model, args.save) + run_validate(val_loader, model, autocast_dtype) + + progress.display_summary() + + if args.tensorboard: + draw_tensorboard(epoch, None, top1.avg, top5.avg, 'val', args) + + return top1.avg + +def validate_quantization(val_loader, model, criterion, profiling, 
args): + batch_time = AverageMeter('Time', ':6.3f', Summary.NONE) + losses = AverageMeter('Loss', ':.4e', Summary.NONE) + top1 = AverageMeter('Acc@1', ':6.2f', Summary.AVERAGE) + top5 = AverageMeter('Acc@5', ':6.2f', Summary.AVERAGE) + progress = ProgressMeter( + len(val_loader) + (args.distributed and (len(val_loader.sampler) * args.world_size < len(val_loader.dataset))), + [batch_time, losses, top1, top5], + prefix='Test: ') + + # switch to evaluate mode + model.eval() + + # record throughput + throughput = 0.0 + + with torch.inference_mode(): + for i, (images, target) in enumerate(val_loader): + if args.xpu is not None and args.benchmark == 1: + images = images.to(args.xpu) + + start = time.time() + with torch.autograd.profiler_legacy.profile(enabled=profiling, use_xpu=True, record_shapes=False) as prof: + + if args.xpu is not None and args.benchmark == 0: + images = images.to(args.xpu) + + # compute output + output = model(images) + + # D2H + output = output.to("cpu") + + # sync for time measurement + torch.xpu.synchronize(args.xpu) + + # measure elapsed time + end = time.time() + batch_time.update(end - start) + duration_eval = end - start + + if profiling: + torch.save(prof.key_averages().table(sort_by="self_xpu_time_total"), './profiling.int8.inf.pt') + torch.save(prof.table(sort_by="id", row_limit=100000), './profiling.detailed.int8.inf.pt') + + loss = criterion(output, target) + + # measure accuracy and record loss + acc1, acc5 = accuracy(output, target, topk=(1, 5)) + losses.update(loss.item(), images.size(0)) + top1.update(acc1[0], images.size(0)) + top5.update(acc5[0], images.size(0)) + + if i % args.print_freq == 0: + progress.display(i + 1) + + # exclude first iteration for calculating througput + perf_start_iter = 1 + if args.benchmark == 1 and args.num_iterations >= 500: + perf_start_iter = math.floor(args.num_iterations * 0.7) + if i >= perf_start_iter: + throughput += args.batch_size / duration_eval + + if i == (args.num_iterations - 1) and args.num_iterations >= 2: + print('Quantization Evalution performance: batch size:%d, throughput:%.2f image/sec, Acc@1:%.2f, Acc@5:%.2f' + % (args.batch_size, throughput / (args.num_iterations - perf_start_iter), top1.avg, top5.avg)) + sys.exit(0) + elif args.num_iterations == 0 and i == len(val_loader) - 1: + print('Quantization Evalution performance: batch size:%d, throughput:%.2f image/sec, Acc@1:%.2f, Acc@5:%.2f' + % (args.batch_size, throughput / (len(val_loader) - 2), top1.avg, top5.avg)) + sys.exit(0) + + progress.display_summary() + + return top1.avg + +def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'): + torch.save(state, filename) + if is_best: + shutil.copyfile(filename, 'model_best.pth.tar') + +def qnormalize(tensor, mean, std, scl): + dtype = tensor.dtype + mean = torch.as_tensor(mean, dtype=dtype, device=tensor.device) + std = torch.as_tensor(std, dtype=dtype, device=tensor.device) + if mean.ndim == 1: + mean = mean[:, None, None] + if std.ndim == 1: + std = std[:, None, None] + + tensor.sub_(mean).div_(std)#.mul_(scale)#tensor.sub_(255 * mean).mul(128/255*(1/(1-0.406))) + out = torch.quantize_per_tensor(tensor, scale=scl, zero_point=0, dtype=torch.qint8) + return out + +def compute_scale(val_loader_com): + for i, (input, target) in enumerate(val_loader_com): + scale = 1.0 / (128 / torch.max(input)) + return scale + +class QNormalize(object): + def __init__(self, mean, std, scale): + self.mean = mean + self.std = std + self.scale = scale + + def __call__(self, tensor): + return qnormalize(tensor, 
self.mean, self.std, self.scale) + +class Summary(Enum): + NONE = 0 + AVERAGE = 1 + SUM = 2 + COUNT = 3 + +class AverageMeter(object): + """Computes and stores the average and current value""" + def __init__(self, name, fmt=':f', summary_type=Summary.AVERAGE): + self.name = name + self.fmt = fmt + self.summary_type = summary_type + self.reset() + + def reset(self): + self.val = 0 + self.avg = 0 + self.sum = 0 + self.count = 0 + + def update(self, val, n=1): + self.val = val + self.sum += val * n + self.count += n + self.avg = self.sum / self.count + + def all_reduce(self): + total = torch.FloatTensor([self.sum, self.count]) + dist.all_reduce(total, dist.ReduceOp.SUM, async_op=False) + self.sum, self.count = total.tolist() + self.avg = self.sum / self.count + + def __str__(self): + fmtstr = '{name} {val' + self.fmt + '} ({avg' + self.fmt + '})' + return fmtstr.format(**self.__dict__) + + def summary(self): + fmtstr = '' + if self.summary_type is Summary.NONE: + fmtstr = '' + elif self.summary_type is Summary.AVERAGE: + fmtstr = '{name} {avg:.3f}' + elif self.summary_type is Summary.SUM: + fmtstr = '{name} {sum:.3f}' + elif self.summary_type is Summary.COUNT: + fmtstr = '{name} {count:.3f}' + else: + raise ValueError('invalid summary type %r' % self.summary_type) + + return fmtstr.format(**self.__dict__) + +class ProgressMeter(object): + def __init__(self, num_batches, meters, prefix=""): + self.batch_fmtstr = self._get_batch_fmtstr(num_batches) + self.meters = meters + self.prefix = prefix + + def display(self, batch): + entries = [self.prefix + self.batch_fmtstr.format(batch)] + entries += [str(meter) for meter in self.meters] + print('\t'.join(entries)) + + def display_summary(self): + entries = [" *"] + entries += [meter.summary() for meter in self.meters] + print(' '.join(entries)) + + def _get_batch_fmtstr(self, num_batches): + num_digits = len(str(num_batches // 1)) + fmt = '{:' + str(num_digits) + 'd}' + return '[' + fmt + '/' + fmt.format(num_batches) + ']' + +def accuracy(output, target, topk=(1,)): + """Computes the accuracy over the k top predictions for the specified values of k""" + with torch.no_grad(): + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + pred = pred.cpu() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + correct_k = correct[:k].reshape(-1).float().sum(0, keepdim=True) + res.append(correct_k.mul_(100.0 / batch_size)) + return res + +def draw_tensorboard(num_epoch, avg_loss, avg_acc1, avg_acc5, mode, args): + if mode == 'train': + writer.add_scalar('training: learning rate', args.lr, num_epoch) + writer.add_scalar('training: loss', avg_loss, num_epoch) + writer.add_scalar('training: top1 acc', avg_acc1, num_epoch) + writer.add_scalar('training: top5 acc', avg_acc5, num_epoch) + else: + writer.add_scalar('val: top1 acc', avg_acc1, num_epoch) + writer.add_scalar('val: top5 acc', avg_acc5, num_epoch) + +if __name__ == '__main__': + main() diff --git a/models/image_recognition/tensorflow/resnet50v1_5/inference/bfloat16/datasets.py b/models/image_recognition/tensorflow/resnet50v1_5/inference/bfloat16/datasets.py new file mode 100644 index 000000000..954a54d5c --- /dev/null +++ b/models/image_recognition/tensorflow/resnet50v1_5/inference/bfloat16/datasets.py @@ -0,0 +1,96 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with 
the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# + +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Benchmark dataset utilities. +""" + +from abc import abstractmethod +import os + +import tensorflow as tf + +import preprocessing + +IMAGENET_NUM_TRAIN_IMAGES = 1281167 +IMAGENET_NUM_VAL_IMAGES = 50000 +IMAGENET_NUM_CLASSES = 1000 + +class Dataset(object): + """Abstract class for cnn benchmarks dataset.""" + + def __init__(self, name, data_dir=None): + self.name = name + if data_dir is None: + raise ValueError('Data directory not specified') + self.data_dir = data_dir + + def tf_record_pattern(self, subset): + return os.path.join(self.data_dir, '%s-*-of-*' % subset) + + def reader(self): + return tf.compat.v1.TFRecordReader() + + @abstractmethod + def num_classes(self): + pass + + @abstractmethod + def num_examples_per_epoch(self, subset): + pass + + def __str__(self): + return self.name + + +class ImagenetData(Dataset): + + def __init__(self, data_dir=None): + super(ImagenetData, self).__init__('ImageNet', data_dir) + + def num_classes(self): + return IMAGENET_NUM_CLASSES + + def num_examples_per_epoch(self, subset='train'): + if subset == 'train': + return IMAGENET_NUM_TRAIN_IMAGES + elif subset == 'validation': + return IMAGENET_NUM_VAL_IMAGES + elif subset == 'calibrate' or subset == 'calibration': + return 100 + else: + raise ValueError('Invalid data subset "%s"' % subset) + + def get_image_preprocessor(self): + return preprocessing.RecordInputImagePreprocessor \ No newline at end of file diff --git a/models/image_recognition/tensorflow/resnet50v1_5/inference/bfloat16/eval_image_classifier_inference.py b/models/image_recognition/tensorflow/resnet50v1_5/inference/bfloat16/eval_image_classifier_inference.py new file mode 100644 index 000000000..a8cc6bd39 --- /dev/null +++ b/models/image_recognition/tensorflow/resnet50v1_5/inference/bfloat16/eval_image_classifier_inference.py @@ -0,0 +1,273 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +# + +import time +import sys +from argparse import ArgumentParser + +import tensorflow as tf +from tensorflow.python.tools.optimize_for_inference_lib import optimize_for_inference +from tensorflow.python.framework import dtypes + +import datasets +import numpy as np + +INPUTS = 'input_tensor' +OUTPUTS = 'softmax_tensor' + +RESNET_IMAGE_SIZE = 224 + + +class eval_classifier_optimized_graph: + """Evaluate image classifier with optimized TensorFlow graph""" + + def __init__(self): + + arg_parser = ArgumentParser(description='Parse args') + + arg_parser.add_argument('-b', "--batch-size", + help="Specify the batch size. If this " \ + "parameter is not specified or is -1, the " \ + "largest ideal batch size for the model will " \ + "be used.", + dest="batch_size", type=int, default=-1) + + arg_parser.add_argument('-e', "--num-inter-threads", + help='The number of inter-op parallelism threads.', + dest='num_inter_threads', type=int, default=0) + + arg_parser.add_argument('-a', "--num-intra-threads", + help='The number of intra-op parallelism threads.', + dest='num_intra_threads', type=int, default=0) + + arg_parser.add_argument('-m', "--model-name", + help='Specify the model name to run the benchmark for', + dest='model_name') + + arg_parser.add_argument('-g', "--input-graph", + help='Specify the input graph for the transform tool', + dest='input_graph') + + arg_parser.add_argument('-d', "--data-location", + help='Specify the location of the data. ' + 'If this parameter is not specified, ' + 'the benchmark will use random/dummy data.', + dest="data_location", default=None) + + arg_parser.add_argument('-r', "--accuracy-only", + help='For accuracy measurement only.', + dest='accuracy_only', action='store_true') + arg_parser.add_argument('--calibrate', dest='calibrate', + help='Run accuracy with calibration data ' + 'to generate min_max ranges, calibrate=[True/False]', + type=bool, default=False) + arg_parser.add_argument("--results-file-path", + help="File path for the inference results", + dest="results_file_path", default=None) + arg_parser.add_argument("--warmup-steps", type=int, default=10, + help="number of warmup steps") + arg_parser.add_argument("--steps", type=int, default=50, + help="number of steps") + + arg_parser.add_argument( + '--data-num-inter-threads', dest='data_num_inter_threads', + help='number of inter-op threads for the data layer', + type=int, default=32) + arg_parser.add_argument( + '--data-num-intra-threads', dest='data_num_intra_threads', + help='number of intra-op threads for the data layer', + type=int, default=14) + arg_parser.add_argument( + '--num-cores', dest='num_cores', + help='number of cores', + type=int, default=28) + + self.args = arg_parser.parse_args() + # validate the arguments + self.validate_args() + + def write_results_output(self, predictions, filenames, labels): + # If a results_file_path is provided, write the predictions to the file + if self.args.results_file_path: + top_predictions = np.argmax(predictions, 1) + with open(self.args.results_file_path, "a") as fp: + for filename, expected_label, top_prediction in zip(filenames, labels, top_predictions): + fp.write("{},{},{}\n".format(filename, expected_label, top_prediction)) + + def run(self): + """run benchmark with optimized graph""" + + print("Run inference") + + data_config = tf.compat.v1.ConfigProto() + data_config.intra_op_parallelism_threads = self.args.data_num_intra_threads + data_config.inter_op_parallelism_threads = 
self.args.data_num_inter_threads + data_config.use_per_session_threads = 1 + + infer_config = tf.compat.v1.ConfigProto() + infer_config.intra_op_parallelism_threads = self.args.num_intra_threads + infer_config.inter_op_parallelism_threads = self.args.num_inter_threads + infer_config.use_per_session_threads = 1 + + data_graph = tf.Graph() + with data_graph.as_default(): + if (self.args.data_location): + print("Inference with real data.") + if self.args.calibrate: + subset = 'calibration' + else: + subset = 'validation' + dataset = datasets.ImagenetData(self.args.data_location) + preprocessor = dataset.get_image_preprocessor()( + RESNET_IMAGE_SIZE, RESNET_IMAGE_SIZE, self.args.batch_size, + num_cores=self.args.num_cores, + resize_method='crop') + + images, labels, filenames = preprocessor.minibatch(dataset, subset=subset) + + # If a results file path is provided, then start the prediction output file + if self.args.results_file_path: + with open(self.args.results_file_path, "w+") as fp: + fp.write("filename,actual,prediction\n") + else: + print("Inference with dummy data.") + input_shape = [self.args.batch_size, RESNET_IMAGE_SIZE, RESNET_IMAGE_SIZE, 3] + #input_shape = [self.args.batch_size, 3, RESNET_IMAGE_SIZE, RESNET_IMAGE_SIZE] + images = tf.random.uniform(input_shape, 0.0, 255.0, dtype=tf.bfloat16, name='synthetic_images') + + infer_graph = tf.Graph() + with infer_graph.as_default(): + graph_def = tf.compat.v1.GraphDef() + with tf.compat.v1.gfile.FastGFile(self.args.input_graph, 'rb') as input_file: + input_graph_content = input_file.read() + graph_def.ParseFromString(input_graph_content) + + output_graph = optimize_for_inference(graph_def, [INPUTS], + [OUTPUTS], dtypes.bfloat16.as_datatype_enum, False) + tf.import_graph_def(output_graph, name='') + + # Definite input and output Tensors for detection_graph + input_tensor = infer_graph.get_tensor_by_name('input_tensor:0') + output_tensor = infer_graph.get_tensor_by_name('softmax_tensor:0') + + data_sess = tf.compat.v1.Session(graph=data_graph, config=data_config) + infer_sess = tf.compat.v1.Session(graph=infer_graph, config=infer_config) + + num_processed_images = 0 + num_remaining_images = dataset.num_examples_per_epoch(subset=subset) - num_processed_images \ + if self.args.data_location else datasets.IMAGENET_NUM_VAL_IMAGES + + if (not self.args.accuracy_only): + iteration = 0 + warm_up_iteration = self.args.warmup_steps + total_run = self.args.steps + total_time = 0 + + while num_remaining_images >= self.args.batch_size and iteration < total_run: + iteration += 1 + tf_filenames = None + np_labels = None + data_load_start = time.time() + if self.args.results_file_path: + image_np, np_labels, tf_filenames = data_sess.run([images, labels, filenames]) + else: + image_np = data_sess.run(images) + + data_load_time = time.time() - data_load_start + + num_processed_images += self.args.batch_size + num_remaining_images -= self.args.batch_size + + start_time = time.time() + predictions = infer_sess.run(output_tensor, feed_dict={input_tensor: image_np}) + time_consume = time.time() - start_time + + # Write out the file name, expected label, and top prediction + self.write_results_output(predictions, tf_filenames, np_labels) + + # only add data loading time for real data, not for dummy data + if self.args.data_location: + time_consume += data_load_time + + print('Iteration %d: %.6f sec' % (iteration, time_consume)) + if iteration > warm_up_iteration: + total_time += time_consume + + time_average = total_time / (iteration - warm_up_iteration) + 
print('Average time: %.6f sec' % (time_average)) + + print('Batch size = %d' % self.args.batch_size) + if (self.args.batch_size == 1): + print('Latency: %.3f ms' % (time_average * 1000)) + # print throughput for both batch size 1 and 128 + print('Throughput: %.3f images/sec' % (self.args.batch_size / time_average)) + + else: # accuracy check + total_accuracy1, total_accuracy5 = (0.0, 0.0) + + while num_remaining_images >= self.args.batch_size: + # Reads and preprocess data + tf_filenames = None + if self.args.results_file_path: + np_images, np_labels, tf_filenames = data_sess.run([images, labels, filenames]) + else: + np_images, np_labels = data_sess.run([images, labels]) + num_processed_images += self.args.batch_size + num_remaining_images -= self.args.batch_size + + start_time = time.time() + # Compute inference on the preprocessed data + predictions = infer_sess.run(output_tensor, + {input_tensor: np_images}) + elapsed_time = time.time() - start_time + + # Write out the file name, expected label, and top prediction + self.write_results_output(predictions, tf_filenames, np_labels) + + with tf.Graph().as_default() as accu_graph: + accuracy1 = tf.reduce_sum( + input_tensor=tf.cast(tf.nn.in_top_k(predictions=tf.cast(tf.constant(predictions), tf.float32), + targets=tf.constant(np_labels), k=1), tf.float32)) + + accuracy5 = tf.reduce_sum( + input_tensor=tf.cast(tf.nn.in_top_k(predictions=tf.cast(tf.constant(predictions), tf.float32), + targets=tf.constant(np_labels), k=5), tf.float32)) + with tf.compat.v1.Session() as accu_sess: + np_accuracy1, np_accuracy5 = accu_sess.run([accuracy1, accuracy5]) + + total_accuracy1 += np_accuracy1 + total_accuracy5 += np_accuracy5 + + print("Iteration time: %0.4f ms" % elapsed_time) + print("Processed %d images. (Top1 accuracy, Top5 accuracy) = (%0.4f, %0.4f)" \ + % (num_processed_images, total_accuracy1 / num_processed_images, + total_accuracy5 / num_processed_images)) + + def validate_args(self): + """validate the arguments""" + + if not self.args.data_location: + if self.args.accuracy_only: + raise ValueError("You must use real data for accuracy measurement.") + + +if __name__ == "__main__": + evaluate_opt_graph = eval_classifier_optimized_graph() + evaluate_opt_graph.run() diff --git a/models/image_recognition/tensorflow/resnet50v1_5/inference/bfloat16/preprocessing.py b/models/image_recognition/tensorflow/resnet50v1_5/inference/bfloat16/preprocessing.py new file mode 100644 index 000000000..65d819895 --- /dev/null +++ b/models/image_recognition/tensorflow/resnet50v1_5/inference/bfloat16/preprocessing.py @@ -0,0 +1,178 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# + +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
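# ---------------------------------------------------------------------------
# Editor's note: illustrative sketch only, not part of the patch above. The
# accuracy branch of the bfloat16 script casts the predictions to float32 and
# uses tf.nn.in_top_k to count top-1/top-5 hits per batch. A minimal NumPy
# restatement of that counting step (the helper name `topk_correct` and the
# dummy values are hypothetical; tie-breaking in tf.nn.in_top_k may differ):
import numpy as np

def topk_correct(predictions, labels, k):
    # predictions: [batch, num_classes] scores; labels: [batch] integer class ids
    topk = np.argsort(-predictions, axis=1)[:, :k]
    return sum(int(label in row) for row, label in zip(topk, labels))

# e.g. running totals over batches, as in the loop above:
#   total_accuracy1 += topk_correct(batch_scores, batch_labels, k=1)
#   total_accuracy5 += topk_correct(batch_scores, batch_labels, k=5)
# ---------------------------------------------------------------------------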
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf +from tensorflow.python.data.experimental import parallel_interleave +from tensorflow.python.data.experimental import map_and_batch +from tensorflow.python.platform import gfile + + +def parse_example_proto(example_serialized): + """Parses an Example proto containing a training example of an image. + """ + # Dense features in Example proto. + feature_map = { + 'image/encoded': tf.io.FixedLenFeature([], dtype=tf.string, + default_value=''), + 'image/class/label': tf.io.FixedLenFeature([1], dtype=tf.int64, + default_value=-1), + 'image/filename': tf.io.FixedLenFeature([], dtype=tf.string, + default_value="") + } + sparse_float32 = tf.io.VarLenFeature(dtype=tf.float32) + # Sparse features in Example proto. + feature_map.update( + {k: sparse_float32 for k in ['image/object/bbox/xmin', + 'image/object/bbox/ymin', + 'image/object/bbox/xmax', + 'image/object/bbox/ymax']}) + + features = tf.io.parse_single_example(serialized=example_serialized, features=feature_map) + label = tf.cast(features['image/class/label'], dtype=tf.int32) + filename = tf.cast(features['image/filename'], dtype=tf.string) + + return features['image/encoded'], label, filename + + +def eval_image(image, height, width, resize_method, + central_fraction=0.875, scope=None): + + with tf.compat.v1.name_scope('eval_image'): + if resize_method == 'crop': + shape = tf.shape(input=image) + image = tf.cond(pred=tf.less(shape[0], shape[1]), + true_fn=lambda: tf.image.resize(image, + tf.convert_to_tensor(value=[256, 256 * shape[1] / shape[0]], + dtype=tf.int32)), + false_fn=lambda: tf.image.resize(image, + tf.convert_to_tensor(value=[256 * shape[0] / shape[1], 256], + dtype=tf.int32))) + + shape = tf.shape(input=image) + y0 = (shape[0] - height) // 2 + x0 = (shape[1] - width) // 2 + distorted_image = tf.image.crop_to_bounding_box(image, y0, x0, height, width) + distorted_image.set_shape([height, width, 3]) + means = tf.broadcast_to([123.68, 116.78, 103.94], tf.shape(input=distorted_image)) + return distorted_image - means + else: # bilinear + if image.dtype != tf.float32: + image = tf.image.convert_image_dtype(image, dtype=tf.float32) + # Crop the central region of the image with an area containing 87.5% of + # the original image. + if central_fraction: + image = tf.image.central_crop(image, central_fraction=central_fraction) + + if height and width: + # Resize the image to the specified height and width. 
+ image = tf.expand_dims(image, 0) + image = tf.image.resize(image, [height, width], + method=tf.image.ResizeMethod.BILINEAR) + image = tf.squeeze(image, [0]) + image = tf.subtract(image, 0.5) + image = tf.multiply(image, 2.0) + return image + +class RecordInputImagePreprocessor(object): + """Preprocessor for images with RecordInput format.""" + + def __init__(self, + height, + width, + batch_size, + num_cores, + resize_method="bilinear"): + + self.height = height + self.width = width + self.batch_size = batch_size + self.num_cores = num_cores + self.resize_method = resize_method + + def parse_and_preprocess(self, value): + # parse + image_buffer, label_index, filename = parse_example_proto(value) + # preprocess + image = tf.image.decode_jpeg( + image_buffer, channels=3, fancy_upscaling=False, dct_method='INTEGER_FAST') + image = eval_image(image, self.height, self.width, self.resize_method) + return (image, label_index, filename) + + def minibatch(self, dataset, subset, cache_data=False): + + with tf.compat.v1.name_scope('batch_processing'): + + glob_pattern = dataset.tf_record_pattern(subset) + file_names = gfile.Glob(glob_pattern) + if not file_names: + raise ValueError('Found no files in --data_dir matching: {}' + .format(glob_pattern)) + ds = tf.data.TFRecordDataset.list_files(file_names) + + ds = ds.apply( + parallel_interleave( + tf.data.TFRecordDataset, cycle_length=self.num_cores, block_length=5, + sloppy=True, + buffer_output_elements=10000, prefetch_input_elements=10000)) + + if cache_data: + ds = ds.take(1).cache().repeat() + + ds = ds.prefetch(buffer_size=10000) + #ds = ds.prefetch(buffer_size=self.batch_size) + + # num of parallel batches not greater than 56 + max_num_parallel_batches = min(56, 2 * self.num_cores) + ds = ds.apply( + map_and_batch( + map_func=self.parse_and_preprocess, + batch_size=self.batch_size, + num_parallel_batches=max_num_parallel_batches, + num_parallel_calls=None)) + + ds = ds.prefetch(buffer_size=tf.data.experimental.AUTOTUNE) + + ds_iterator = tf.compat.v1.data.make_one_shot_iterator(ds) + images, labels, filename = ds_iterator.get_next() + # reshape + labels = tf.reshape(labels, [self.batch_size]) + filename = tf.reshape(filename, [self.batch_size]) + + return images, labels, filename diff --git a/models/image_recognition/tensorflow/resnet50v1_5/inference/eval_image_classifier_inference.py b/models/image_recognition/tensorflow/resnet50v1_5/inference/eval_image_classifier_inference.py index 27351dc9a..ae130f230 100644 --- a/models/image_recognition/tensorflow/resnet50v1_5/inference/eval_image_classifier_inference.py +++ b/models/image_recognition/tensorflow/resnet50v1_5/inference/eval_image_classifier_inference.py @@ -34,6 +34,16 @@ RESNET_IMAGE_SIZE = 224 +import os +if os.environ['PRECISION']=='fp16': + tf_dtype=tf.float16 + tf_py_frame_dtype=dtypes.float16 +elif os.environ['PRECISION']=='bfloat16': + tf_dtype=tf.bfloat16 + tf_py_frame_dtype=dtypes.bfloat16 +else: + tf_dtype=tf.float32 + tf_py_frame_dtype=dtypes.float32 class eval_classifier_optimized_graph: """Evaluate image classifier with optimized TensorFlow graph""" @@ -161,7 +171,7 @@ def run(self): else: print("Inference with dummy data.") input_shape = [self.args.batch_size, RESNET_IMAGE_SIZE, RESNET_IMAGE_SIZE, 3] - images = tf.random.uniform(input_shape, 0.0, 255.0, dtype=tf.float32, name='synthetic_images') + images = tf.random.uniform(input_shape, 0.0, 255.0, dtype=tf_dtype, name='synthetic_images') infer_graph = tf.Graph() with infer_graph.as_default(): @@ -254,11 +264,11 @@ def 
run(self): with tf.Graph().as_default() as accu_graph: accuracy1 = tf.reduce_sum( - input_tensor=tf.cast(tf.nn.in_top_k(predictions=tf.constant(predictions), + input_tensor=tf.cast(tf.nn.in_top_k(predictions=tf.constant(predictions, dtype=tf.float32), targets=tf.constant(np_labels), k=1), tf.float32)) accuracy5 = tf.reduce_sum( - input_tensor=tf.cast(tf.nn.in_top_k(predictions=tf.constant(predictions), + input_tensor=tf.cast(tf.nn.in_top_k(predictions=tf.constant(predictions, dtype=tf.float32), targets=tf.constant(np_labels), k=5), tf.float32)) with tf.compat.v1.Session(config=infer_config) as accu_sess: np_accuracy1, np_accuracy5 = accu_sess.run([accuracy1, accuracy5]) @@ -273,7 +283,6 @@ def run(self): def validate_args(self): """validate the arguments""" - if not self.args.data_location: if self.args.accuracy_only: raise ValueError("You must use real data for accuracy measurement.") diff --git a/models/image_recognition/tensorflow/resnet50v1_5/inference/fp16/__init__.py b/models/image_recognition/tensorflow/resnet50v1_5/inference/fp16/__init__.py new file mode 100644 index 000000000..159180624 --- /dev/null +++ b/models/image_recognition/tensorflow/resnet50v1_5/inference/fp16/__init__.py @@ -0,0 +1,20 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: EPL-2.0 +# + diff --git a/models/image_recognition/tensorflow/resnet50v1_5/inference/fp16/datasets.py b/models/image_recognition/tensorflow/resnet50v1_5/inference/fp16/datasets.py new file mode 100644 index 000000000..954a54d5c --- /dev/null +++ b/models/image_recognition/tensorflow/resnet50v1_5/inference/fp16/datasets.py @@ -0,0 +1,96 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# + +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
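# ---------------------------------------------------------------------------
# Editor's note: illustrative sketch only, not part of the patch above. The
# hunk added to eval_image_classifier_inference.py selects TensorFlow dtypes
# from the PRECISION environment variable (fp16 / bfloat16 / fp32). A small,
# hedged restatement of that mapping; the helper name `resolve_dtypes` is
# hypothetical, and os.environ.get is used here so an unset PRECISION falls
# back to fp32 instead of raising KeyError as the inline lookup would:
import os

import tensorflow as tf
from tensorflow.python.framework import dtypes

_PRECISION_TO_DTYPES = {
    "fp16": (tf.float16, dtypes.float16),
    "bfloat16": (tf.bfloat16, dtypes.bfloat16),
    "fp32": (tf.float32, dtypes.float32),
}

def resolve_dtypes(default="fp32"):
    # Returns the (tf dtype, dtypes enum) pair for the requested precision.
    precision = os.environ.get("PRECISION", default)
    return _PRECISION_TO_DTYPES.get(precision, _PRECISION_TO_DTYPES[default])
# ---------------------------------------------------------------------------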
+# ============================================================================== + +"""Benchmark dataset utilities. +""" + +from abc import abstractmethod +import os + +import tensorflow as tf + +import preprocessing + +IMAGENET_NUM_TRAIN_IMAGES = 1281167 +IMAGENET_NUM_VAL_IMAGES = 50000 +IMAGENET_NUM_CLASSES = 1000 + +class Dataset(object): + """Abstract class for cnn benchmarks dataset.""" + + def __init__(self, name, data_dir=None): + self.name = name + if data_dir is None: + raise ValueError('Data directory not specified') + self.data_dir = data_dir + + def tf_record_pattern(self, subset): + return os.path.join(self.data_dir, '%s-*-of-*' % subset) + + def reader(self): + return tf.compat.v1.TFRecordReader() + + @abstractmethod + def num_classes(self): + pass + + @abstractmethod + def num_examples_per_epoch(self, subset): + pass + + def __str__(self): + return self.name + + +class ImagenetData(Dataset): + + def __init__(self, data_dir=None): + super(ImagenetData, self).__init__('ImageNet', data_dir) + + def num_classes(self): + return IMAGENET_NUM_CLASSES + + def num_examples_per_epoch(self, subset='train'): + if subset == 'train': + return IMAGENET_NUM_TRAIN_IMAGES + elif subset == 'validation': + return IMAGENET_NUM_VAL_IMAGES + elif subset == 'calibrate' or subset == 'calibration': + return 100 + else: + raise ValueError('Invalid data subset "%s"' % subset) + + def get_image_preprocessor(self): + return preprocessing.RecordInputImagePreprocessor \ No newline at end of file diff --git a/models/image_recognition/tensorflow/resnet50v1_5/inference/fp16/eval_image_classifier_inference.py b/models/image_recognition/tensorflow/resnet50v1_5/inference/fp16/eval_image_classifier_inference.py new file mode 100644 index 000000000..4b99e90f3 --- /dev/null +++ b/models/image_recognition/tensorflow/resnet50v1_5/inference/fp16/eval_image_classifier_inference.py @@ -0,0 +1,274 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# +import os +import time +import sys +from argparse import ArgumentParser + +import tensorflow as tf +from tensorflow.python.tools.optimize_for_inference_lib import optimize_for_inference +from tensorflow.python.framework import dtypes + +import datasets +import numpy as np + +INPUTS = 'input_tensor' +OUTPUTS = 'softmax_tensor' + +RESNET_IMAGE_SIZE = 224 + + +class eval_classifier_optimized_graph: + """Evaluate image classifier with optimized TensorFlow graph""" + + def __init__(self): + + arg_parser = ArgumentParser(description='Parse args') + + arg_parser.add_argument('-b', "--batch-size", + help="Specify the batch size. 
If this " \ + "parameter is not specified or is -1, the " \ + "largest ideal batch size for the model will " \ + "be used.", + dest="batch_size", type=int, default=-1) + + arg_parser.add_argument('-e', "--num-inter-threads", + help='The number of inter-thread.', + dest='num_inter_threads', type=int, default=0) + + arg_parser.add_argument('-a', "--num-intra-threads", + help='The number of intra-thread.', + dest='num_intra_threads', type=int, default=0) + + arg_parser.add_argument('-m', "--model-name", + help='Specify the model name to run benchmark for', + dest='model_name') + + arg_parser.add_argument('-g', "--input-graph", + help='Specify the input graph for the transform tool', + dest='input_graph') + + arg_parser.add_argument('-d', "--data-location", + help='Specify the location of the data. ' + 'If this parameter is not specified, ' + 'the benchmark will use random/dummy data.', + dest="data_location", default=None) + + arg_parser.add_argument('-r', "--accuracy-only", + help='For accuracy measurement only.', + dest='accuracy_only', action='store_true') + arg_parser.add_argument('--calibrate', dest='calibrate', + help='Run accuracy with calibration data,' + 'to generate min_max ranges, calibrate=[True/False]', + type=bool, default=False) + arg_parser.add_argument("--results-file-path", + help="File path for the inference results", + dest="results_file_path", default=None) + arg_parser.add_argument("--warmup-steps", type=int, default=10, + help="number of warmup steps") + arg_parser.add_argument("--steps", type=int, default=50, + help="number of steps") + + arg_parser.add_argument( + '--data-num-inter-threads', dest='data_num_inter_threads', + help='number threads across operators', + type=int, default=32) + arg_parser.add_argument( + '--data-num-intra-threads', dest='data_num_intra_threads', + help='number threads for data layer operator', + type=int, default=14) + arg_parser.add_argument( + '--num-cores', dest='num_cores', + help='number of cores', + type=int, default=28) + + self.args = arg_parser.parse_args() + # validate the arguements + self.validate_args() + os.environ['ITEX_AUTO_MIXED_PRECISION'] = '1' + os.environ['ITEX_AUTO_MIXED_PRECISION_DATA_TYPE'] = 'FLOAT16' + + def write_results_output(self, predictions, filenames, labels): + # If a results_file_path is provided, write the predictions to the file + if self.args.results_file_path: + top_predictions = np.argmax(predictions, 1) + with open(self.args.results_file_path, "a") as fp: + for filename, expected_label, top_prediction in zip(filenames, labels, top_predictions): + fp.write("{},{},{}\n".format(filename, expected_label, top_prediction)) + + def run(self): + """run benchmark with optimized graph""" + + print("Run inference") + + data_config = tf.compat.v1.ConfigProto() + data_config.intra_op_parallelism_threads = self.args.data_num_intra_threads + data_config.inter_op_parallelism_threads = self.args.data_num_inter_threads + data_config.use_per_session_threads = 1 + + infer_config = tf.compat.v1.ConfigProto() + infer_config.intra_op_parallelism_threads = self.args.num_intra_threads + infer_config.inter_op_parallelism_threads = self.args.num_inter_threads + infer_config.use_per_session_threads = 1 + + data_graph = tf.Graph() + with data_graph.as_default(): + if (self.args.data_location): + print("Inference with real data.") + if self.args.calibrate: + subset = 'calibration' + else: + subset = 'validation' + dataset = datasets.ImagenetData(self.args.data_location) + preprocessor = dataset.get_image_preprocessor()( + 
RESNET_IMAGE_SIZE, RESNET_IMAGE_SIZE, self.args.batch_size, + num_cores=self.args.num_cores, + resize_method='crop') + + images, labels, filenames = preprocessor.minibatch(dataset, subset=subset) + + # If a results file path is provided, then start the prediction output file + if self.args.results_file_path: + with open(self.args.results_file_path, "w+") as fp: + fp.write("filename,actual,prediction\n") + else: + print("Inference with dummy data.") + input_shape = [self.args.batch_size, RESNET_IMAGE_SIZE, RESNET_IMAGE_SIZE, 3] + images = tf.random.uniform(input_shape, 0.0, 255.0, dtype=tf.float32, name='synthetic_images') + + infer_graph = tf.Graph() + with infer_graph.as_default(): + graph_def = tf.compat.v1.GraphDef() + with tf.compat.v1.gfile.FastGFile(self.args.input_graph, 'rb') as input_file: + input_graph_content = input_file.read() + graph_def.ParseFromString(input_graph_content) + + output_graph = optimize_for_inference(graph_def, [INPUTS], + [OUTPUTS], dtypes.float32.as_datatype_enum, False) + tf.import_graph_def(output_graph, name='') + + # Definite input and output Tensors for detection_graph + input_tensor = infer_graph.get_tensor_by_name('input_tensor:0') + output_tensor = infer_graph.get_tensor_by_name('softmax_tensor:0') + + data_sess = tf.compat.v1.Session(graph=data_graph, config=data_config) + infer_sess = tf.compat.v1.Session(graph=infer_graph, config=infer_config) + + num_processed_images = 0 + num_remaining_images = dataset.num_examples_per_epoch(subset=subset) - num_processed_images \ + if self.args.data_location else datasets.IMAGENET_NUM_VAL_IMAGES + + if (not self.args.accuracy_only): + iteration = 0 + warm_up_iteration = self.args.warmup_steps + total_run = self.args.steps + total_time = 0 + + while num_remaining_images >= self.args.batch_size and iteration < total_run: + iteration += 1 + tf_filenames = None + np_labels = None + data_load_start = time.time() + if self.args.results_file_path: + image_np, np_labels, tf_filenames = data_sess.run([images, labels, filenames]) + else: + image_np = data_sess.run(images) + + data_load_time = time.time() - data_load_start + + num_processed_images += self.args.batch_size + num_remaining_images -= self.args.batch_size + + start_time = time.time() + predictions = infer_sess.run(output_tensor, feed_dict={input_tensor: image_np}) + time_consume = time.time() - start_time + + # Write out the file name, expected label, and top prediction + self.write_results_output(predictions, tf_filenames, np_labels) + + # only add data loading time for real data, not for dummy data + if self.args.data_location: + time_consume += data_load_time + + print('Iteration %d: %.6f sec' % (iteration, time_consume)) + if iteration > warm_up_iteration: + total_time += time_consume + + time_average = total_time / (iteration - warm_up_iteration) + print('Average time: %.6f sec' % (time_average)) + + print('Batch size = %d' % self.args.batch_size) + if (self.args.batch_size == 1): + print('Latency: %.3f ms' % (time_average * 1000)) + # print throughput for both batch size 1 and 128 + print('Throughput: %.3f images/sec' % (self.args.batch_size / time_average)) + + else: # accuracy check + total_accuracy1, total_accuracy5 = (0.0, 0.0) + + while num_remaining_images >= self.args.batch_size: + # Reads and preprocess data + tf_filenames = None + if self.args.results_file_path: + np_images, np_labels, tf_filenames = data_sess.run([images, labels, filenames]) + else: + np_images, np_labels = data_sess.run([images, labels]) + num_processed_images += 
self.args.batch_size + num_remaining_images -= self.args.batch_size + + start_time = time.time() + # Compute inference on the preprocessed data + predictions = infer_sess.run(output_tensor, + {input_tensor: np_images}) + elapsed_time = time.time() - start_time + + # Write out the file name, expected label, and top prediction + self.write_results_output(predictions, tf_filenames, np_labels) + + with tf.Graph().as_default() as accu_graph: + accuracy1 = tf.reduce_sum( + input_tensor=tf.cast(tf.nn.in_top_k(predictions=tf.cast(tf.constant(predictions), tf.float32), + targets=tf.constant(np_labels), k=1), tf.float32)) + + accuracy5 = tf.reduce_sum( + input_tensor=tf.cast(tf.nn.in_top_k(predictions=tf.cast(tf.constant(predictions), tf.float32), + targets=tf.constant(np_labels), k=5), tf.float32)) + with tf.compat.v1.Session() as accu_sess: + np_accuracy1, np_accuracy5 = accu_sess.run([accuracy1, accuracy5]) + + total_accuracy1 += np_accuracy1 + total_accuracy5 += np_accuracy5 + + print("Iteration time: %0.4f ms" % elapsed_time) + print("Processed %d images. (Top1 accuracy, Top5 accuracy) = (%0.4f, %0.4f)" \ + % (num_processed_images, total_accuracy1 / num_processed_images, + total_accuracy5 / num_processed_images)) + + def validate_args(self): + """validate the arguments""" + + if not self.args.data_location: + if self.args.accuracy_only: + raise ValueError("You must use real data for accuracy measurement.") + + +if __name__ == "__main__": + evaluate_opt_graph = eval_classifier_optimized_graph() + evaluate_opt_graph.run() diff --git a/models/image_recognition/tensorflow/resnet50v1_5/inference/fp16/preprocessing.py b/models/image_recognition/tensorflow/resnet50v1_5/inference/fp16/preprocessing.py new file mode 100644 index 000000000..bf81b4188 --- /dev/null +++ b/models/image_recognition/tensorflow/resnet50v1_5/inference/fp16/preprocessing.py @@ -0,0 +1,179 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# + +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
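# ---------------------------------------------------------------------------
# Editor's note: illustrative sketch only, not part of the patch above. The
# fp16 benchmark loop skips the first --warmup-steps iterations, averages the
# remaining per-iteration times, and reports latency (batch size 1) and
# throughput as batch_size / average_time. A compact restatement of that
# bookkeeping; the helper name `summarize_benchmark` is hypothetical:
def summarize_benchmark(iteration_times, warmup_steps, batch_size):
    # iteration_times: per-iteration wall-clock seconds, in run order
    measured = iteration_times[warmup_steps:]
    time_average = sum(measured) / len(measured)
    latency_ms = time_average * 1000 if batch_size == 1 else None
    throughput = batch_size / time_average  # images/sec
    return time_average, latency_ms, throughput

# e.g. summarize_benchmark([0.90, 0.31, 0.30, 0.29], warmup_steps=1, batch_size=1)
# -> (0.30, 300.0 ms, ~3.3 images/sec)
# ---------------------------------------------------------------------------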
+# ============================================================================== + + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf +from tensorflow.python.data.experimental import parallel_interleave +from tensorflow.python.data.experimental import map_and_batch +from tensorflow.python.platform import gfile + + +def parse_example_proto(example_serialized): + """Parses an Example proto containing a training example of an image. + """ + # Dense features in Example proto. + feature_map = { + 'image/encoded': tf.io.FixedLenFeature([], dtype=tf.string, + default_value=''), + 'image/class/label': tf.io.FixedLenFeature([1], dtype=tf.int64, + default_value=-1), + 'image/filename': tf.io.FixedLenFeature([], dtype=tf.string, + default_value="") + } + sparse_float32 = tf.io.VarLenFeature(dtype=tf.float32) + # Sparse features in Example proto. + feature_map.update( + {k: sparse_float32 for k in ['image/object/bbox/xmin', + 'image/object/bbox/ymin', + 'image/object/bbox/xmax', + 'image/object/bbox/ymax']}) + + features = tf.io.parse_single_example(serialized=example_serialized, features=feature_map) + label = tf.cast(features['image/class/label'], dtype=tf.int32) + filename = tf.cast(features['image/filename'], dtype=tf.string) + + return features['image/encoded'], label, filename + + +def eval_image(image, height, width, resize_method, + central_fraction=0.875, scope=None): + + with tf.compat.v1.name_scope('eval_image'): + if resize_method == 'crop': + shape = tf.shape(input=image) + image = tf.cond(pred=tf.less(shape[0], shape[1]), + true_fn=lambda: tf.image.resize(image, + tf.convert_to_tensor(value=[256, 256 * shape[1] / shape[0]], + dtype=tf.int32)), + false_fn=lambda: tf.image.resize(image, + tf.convert_to_tensor(value=[256 * shape[0] / shape[1], 256], + dtype=tf.int32))) + + shape = tf.shape(input=image) + y0 = (shape[0] - height) // 2 + x0 = (shape[1] - width) // 2 + distorted_image = tf.image.crop_to_bounding_box(image, y0, x0, height, width) + distorted_image.set_shape([height, width, 3]) + means = tf.broadcast_to([123.68, 116.78, 103.94], tf.shape(input=distorted_image)) + return distorted_image - means + else: # bilinear + if image.dtype != tf.float32: + image = tf.image.convert_image_dtype(image, dtype=tf.float32) + # Crop the central region of the image with an area containing 87.5% of + # the original image. + if central_fraction: + image = tf.image.central_crop(image, central_fraction=central_fraction) + + if height and width: + # Resize the image to the specified height and width. 
+ image = tf.expand_dims(image, 0) + image = tf.image.resize(image, [height, width], + method=tf.image.ResizeMethod.BILINEAR) + image = tf.squeeze(image, [0]) + image = tf.subtract(image, 0.5) + image = tf.multiply(image, 2.0) + return image + +class RecordInputImagePreprocessor(object): + """Preprocessor for images with RecordInput format.""" + + def __init__(self, + height, + width, + batch_size, + num_cores, + resize_method="bilinear"): + + self.height = height + self.width = width + self.batch_size = batch_size + self.num_cores = num_cores + self.resize_method = resize_method + + def parse_and_preprocess(self, value): + # parse + image_buffer, label_index, filename = parse_example_proto(value) + # preprocess + image = tf.image.decode_jpeg( + image_buffer, channels=3, fancy_upscaling=False, dct_method='INTEGER_FAST') + image = eval_image(image, self.height, self.width, self.resize_method) + image = tf.transpose(image, perm=[2, 0, 1]) + return (image, label_index, filename) + + def minibatch(self, dataset, subset, cache_data=False): + + with tf.compat.v1.name_scope('batch_processing'): + + glob_pattern = dataset.tf_record_pattern(subset) + file_names = gfile.Glob(glob_pattern) + if not file_names: + raise ValueError('Found no files in --data_dir matching: {}' + .format(glob_pattern)) + ds = tf.data.TFRecordDataset.list_files(file_names) + + ds = ds.apply( + parallel_interleave( + tf.data.TFRecordDataset, cycle_length=self.num_cores, block_length=5, + sloppy=True, + buffer_output_elements=10000, prefetch_input_elements=10000)) + + if cache_data: + ds = ds.take(1).cache().repeat() + + ds = ds.prefetch(buffer_size=10000) + #ds = ds.prefetch(buffer_size=self.batch_size) + + # num of parallel batches not greater than 56 + max_num_parallel_batches = min(56, 2 * self.num_cores) + ds = ds.apply( + map_and_batch( + map_func=self.parse_and_preprocess, + batch_size=self.batch_size, + num_parallel_batches=max_num_parallel_batches, + num_parallel_calls=None)) + + ds = ds.prefetch(buffer_size=tf.data.experimental.AUTOTUNE) + + ds_iterator = tf.compat.v1.data.make_one_shot_iterator(ds) + images, labels, filename = ds_iterator.get_next() + # reshape + labels = tf.reshape(labels, [self.batch_size]) + filename = tf.reshape(filename, [self.batch_size]) + + return images, labels, filename diff --git a/models/image_recognition/tensorflow/resnet50v1_5/inference/fp32/__init__.py b/models/image_recognition/tensorflow/resnet50v1_5/inference/fp32/__init__.py new file mode 100644 index 000000000..159180624 --- /dev/null +++ b/models/image_recognition/tensorflow/resnet50v1_5/inference/fp32/__init__.py @@ -0,0 +1,20 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
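# ---------------------------------------------------------------------------
# Editor's note: illustrative sketch only, not part of the patch above. Unlike
# the other precisions, the fp16 preprocessor transposes each decoded image
# from HWC to CHW (perm=[2, 0, 1]) after the center crop in eval_image(). A
# NumPy restatement of just the crop offsets and the transpose (the resize and
# mean-subtraction steps are omitted; the helper name is hypothetical):
import numpy as np

def center_crop_chw(image_hwc, height, width):
    # Same offset arithmetic as eval_image(): take the centered height x width
    # window, then move channels to the front.
    y0 = (image_hwc.shape[0] - height) // 2
    x0 = (image_hwc.shape[1] - width) // 2
    cropped = image_hwc[y0:y0 + height, x0:x0 + width, :]
    return np.transpose(cropped, (2, 0, 1))

# e.g. center_crop_chw(np.zeros((256, 341, 3)), 224, 224).shape == (3, 224, 224)
# ---------------------------------------------------------------------------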
+# +# SPDX-License-Identifier: EPL-2.0 +# + diff --git a/models/image_recognition/tensorflow/resnet50v1_5/inference/fp32/datasets.py b/models/image_recognition/tensorflow/resnet50v1_5/inference/fp32/datasets.py new file mode 100644 index 000000000..954a54d5c --- /dev/null +++ b/models/image_recognition/tensorflow/resnet50v1_5/inference/fp32/datasets.py @@ -0,0 +1,96 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# + +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Benchmark dataset utilities. +""" + +from abc import abstractmethod +import os + +import tensorflow as tf + +import preprocessing + +IMAGENET_NUM_TRAIN_IMAGES = 1281167 +IMAGENET_NUM_VAL_IMAGES = 50000 +IMAGENET_NUM_CLASSES = 1000 + +class Dataset(object): + """Abstract class for cnn benchmarks dataset.""" + + def __init__(self, name, data_dir=None): + self.name = name + if data_dir is None: + raise ValueError('Data directory not specified') + self.data_dir = data_dir + + def tf_record_pattern(self, subset): + return os.path.join(self.data_dir, '%s-*-of-*' % subset) + + def reader(self): + return tf.compat.v1.TFRecordReader() + + @abstractmethod + def num_classes(self): + pass + + @abstractmethod + def num_examples_per_epoch(self, subset): + pass + + def __str__(self): + return self.name + + +class ImagenetData(Dataset): + + def __init__(self, data_dir=None): + super(ImagenetData, self).__init__('ImageNet', data_dir) + + def num_classes(self): + return IMAGENET_NUM_CLASSES + + def num_examples_per_epoch(self, subset='train'): + if subset == 'train': + return IMAGENET_NUM_TRAIN_IMAGES + elif subset == 'validation': + return IMAGENET_NUM_VAL_IMAGES + elif subset == 'calibrate' or subset == 'calibration': + return 100 + else: + raise ValueError('Invalid data subset "%s"' % subset) + + def get_image_preprocessor(self): + return preprocessing.RecordInputImagePreprocessor \ No newline at end of file diff --git a/models/image_recognition/tensorflow/resnet50v1_5/inference/fp32/eval_image_classifier_inference.py b/models/image_recognition/tensorflow/resnet50v1_5/inference/fp32/eval_image_classifier_inference.py new file mode 100644 index 000000000..f53c457c8 --- /dev/null +++ b/models/image_recognition/tensorflow/resnet50v1_5/inference/fp32/eval_image_classifier_inference.py @@ -0,0 +1,271 @@ +# +# -*- coding: utf-8 -*- 
+# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# + +import time +from argparse import ArgumentParser + +import tensorflow as tf +from tensorflow.python.tools.optimize_for_inference_lib import optimize_for_inference +from tensorflow.python.framework import dtypes + +import datasets +import numpy as np + +INPUTS = 'input_tensor' +OUTPUTS = 'softmax_tensor' + +RESNET_IMAGE_SIZE = 224 + + +class eval_classifier_optimized_graph: + """Evaluate image classifier with optimized TensorFlow graph""" + + def __init__(self): + + arg_parser = ArgumentParser(description='Parse args') + + arg_parser.add_argument('-b', "--batch-size", + help="Specify the batch size. If this " \ + "parameter is not specified or is -1, the " \ + "largest ideal batch size for the model will " \ + "be used.", + dest="batch_size", type=int, default=-1) + + arg_parser.add_argument('-e', "--num-inter-threads", + help='The number of inter-thread.', + dest='num_inter_threads', type=int, default=0) + + arg_parser.add_argument('-a', "--num-intra-threads", + help='The number of intra-thread.', + dest='num_intra_threads', type=int, default=0) + + arg_parser.add_argument('-m', "--model-name", + help='Specify the model name to run benchmark for', + dest='model_name') + + arg_parser.add_argument('-g', "--input-graph", + help='Specify the input graph for the transform tool', + dest='input_graph') + + arg_parser.add_argument('-d', "--data-location", + help='Specify the location of the data. 
' + 'If this parameter is not specified, ' + 'the benchmark will use random/dummy data.', + dest="data_location", default=None) + + arg_parser.add_argument('-r', "--accuracy-only", + help='For accuracy measurement only.', + dest='accuracy_only', action='store_true') + arg_parser.add_argument('--calibrate', dest='calibrate', + help='Run accuracy with calibration data,' + 'to generate min_max ranges, calibrate=[True/False]', + type=bool, default=False) + arg_parser.add_argument("--results-file-path", + help="File path for the inference results", + dest="results_file_path", default=None) + arg_parser.add_argument("--warmup-steps", type=int, default=10, + help="number of warmup steps") + arg_parser.add_argument("--steps", type=int, default=50, + help="number of steps") + + arg_parser.add_argument( + '--data-num-inter-threads', dest='data_num_inter_threads', + help='number threads across operators', + type=int, default=32) + arg_parser.add_argument( + '--data-num-intra-threads', dest='data_num_intra_threads', + help='number threads for data layer operator', + type=int, default=14) + arg_parser.add_argument( + '--num-cores', dest='num_cores', + help='number of cores', + type=int, default=28) + + self.args = arg_parser.parse_args() + # validate the arguements + self.validate_args() + + def write_results_output(self, predictions, filenames, labels): + # If a results_file_path is provided, write the predictions to the file + if self.args.results_file_path: + top_predictions = np.argmax(predictions, 1) + with open(self.args.results_file_path, "a") as fp: + for filename, expected_label, top_prediction in zip(filenames, labels, top_predictions): + fp.write("{},{},{}\n".format(filename, expected_label, top_prediction)) + + def run(self): + """run benchmark with optimized graph""" + + print("Run inference") + + data_config = tf.compat.v1.ConfigProto() + data_config.intra_op_parallelism_threads = self.args.data_num_intra_threads + data_config.inter_op_parallelism_threads = self.args.data_num_inter_threads + data_config.use_per_session_threads = 1 + + infer_config = tf.compat.v1.ConfigProto() + infer_config.intra_op_parallelism_threads = self.args.num_intra_threads + infer_config.inter_op_parallelism_threads = self.args.num_inter_threads + infer_config.use_per_session_threads = 1 + + data_graph = tf.Graph() + with data_graph.as_default(): + if (self.args.data_location): + print("Inference with real data.") + if self.args.calibrate: + subset = 'calibration' + else: + subset = 'validation' + dataset = datasets.ImagenetData(self.args.data_location) + preprocessor = dataset.get_image_preprocessor()( + RESNET_IMAGE_SIZE, RESNET_IMAGE_SIZE, self.args.batch_size, + num_cores=self.args.num_cores, + resize_method='crop') + + images, labels, filenames = preprocessor.minibatch(dataset, subset=subset) + + # If a results file path is provided, then start the prediction output file + if self.args.results_file_path: + with open(self.args.results_file_path, "w+") as fp: + fp.write("filename,actual,prediction\n") + else: + print("Inference with dummy data.") + input_shape = [self.args.batch_size, RESNET_IMAGE_SIZE, RESNET_IMAGE_SIZE, 3] + images = tf.random.uniform(input_shape, 0.0, 255.0, dtype=tf.float32, name='synthetic_images') + + infer_graph = tf.Graph() + with infer_graph.as_default(): + graph_def = tf.compat.v1.GraphDef() + with tf.compat.v1.gfile.FastGFile(self.args.input_graph, 'rb') as input_file: + input_graph_content = input_file.read() + graph_def.ParseFromString(input_graph_content) + + output_graph = 
optimize_for_inference(graph_def, [INPUTS], + [OUTPUTS], dtypes.float32.as_datatype_enum, False) + tf.import_graph_def(output_graph, name='') + + # Definite input and output Tensors for detection_graph + input_tensor = infer_graph.get_tensor_by_name('input_tensor:0') + output_tensor = infer_graph.get_tensor_by_name('softmax_tensor:0') + + data_sess = tf.compat.v1.Session(graph=data_graph, config=data_config) + infer_sess = tf.compat.v1.Session(graph=infer_graph, config=infer_config) + + num_processed_images = 0 + num_remaining_images = dataset.num_examples_per_epoch(subset=subset) - num_processed_images \ + if self.args.data_location else datasets.IMAGENET_NUM_VAL_IMAGES + + if (not self.args.accuracy_only): + iteration = 0 + warm_up_iteration = self.args.warmup_steps + total_run = self.args.steps + total_time = 0 + + while num_remaining_images >= self.args.batch_size and iteration < total_run: + iteration += 1 + tf_filenames = None + np_labels = None + data_load_start = time.time() + if self.args.results_file_path: + image_np, np_labels, tf_filenames = data_sess.run([images, labels, filenames]) + else: + image_np = data_sess.run(images) + + data_load_time = time.time() - data_load_start + + num_processed_images += self.args.batch_size + num_remaining_images -= self.args.batch_size + + start_time = time.time() + predictions = infer_sess.run(output_tensor, feed_dict={input_tensor: image_np}) + time_consume = time.time() - start_time + + # Write out the file name, expected label, and top prediction + self.write_results_output(predictions, tf_filenames, np_labels) + + # only add data loading time for real data, not for dummy data + if self.args.data_location: + time_consume += data_load_time + + print('Iteration %d: %.6f sec' % (iteration, time_consume)) + if iteration > warm_up_iteration: + total_time += time_consume + + time_average = total_time / (iteration - warm_up_iteration) + print('Average time: %.6f sec' % (time_average)) + + print('Batch size = %d' % self.args.batch_size) + if (self.args.batch_size == 1): + print('Latency: %.3f ms' % (time_average * 1000)) + # print throughput for both batch size 1 and 128 + print('Throughput: %.3f images/sec' % (self.args.batch_size / time_average)) + + else: # accuracy check + total_accuracy1, total_accuracy5 = (0.0, 0.0) + + while num_remaining_images >= self.args.batch_size: + # Reads and preprocess data + tf_filenames = None + if self.args.results_file_path: + np_images, np_labels, tf_filenames = data_sess.run([images, labels, filenames]) + else: + np_images, np_labels = data_sess.run([images, labels]) + num_processed_images += self.args.batch_size + num_remaining_images -= self.args.batch_size + + start_time = time.time() + # Compute inference on the preprocessed data + predictions = infer_sess.run(output_tensor, + {input_tensor: np_images}) + elapsed_time = time.time() - start_time + + # Write out the file name, expected label, and top prediction + self.write_results_output(predictions, tf_filenames, np_labels) + + with tf.Graph().as_default() as accu_graph: + accuracy1 = tf.reduce_sum( + input_tensor=tf.cast(tf.nn.in_top_k(predictions=tf.constant(predictions), + targets=tf.constant(np_labels), k=1), tf.float32)) + + accuracy5 = tf.reduce_sum( + input_tensor=tf.cast(tf.nn.in_top_k(predictions=tf.constant(predictions), + targets=tf.constant(np_labels), k=5), tf.float32)) + with tf.compat.v1.Session() as accu_sess: + np_accuracy1, np_accuracy5 = accu_sess.run([accuracy1, accuracy5]) + + total_accuracy1 += np_accuracy1 + total_accuracy5 += 
np_accuracy5 + + print("Iteration time: %0.4f ms" % elapsed_time) + print("Processed %d images. (Top1 accuracy, Top5 accuracy) = (%0.4f, %0.4f)" \ + % (num_processed_images, total_accuracy1 / num_processed_images, + total_accuracy5 / num_processed_images)) + + def validate_args(self): + """validate the arguments""" + + if not self.args.data_location: + if self.args.accuracy_only: + raise ValueError("You must use real data for accuracy measurement.") + + +if __name__ == "__main__": + evaluate_opt_graph = eval_classifier_optimized_graph() + evaluate_opt_graph.run() \ No newline at end of file diff --git a/models/image_recognition/tensorflow/resnet50v1_5/inference/fp32/preprocessing.py b/models/image_recognition/tensorflow/resnet50v1_5/inference/fp32/preprocessing.py new file mode 100644 index 000000000..61f43d872 --- /dev/null +++ b/models/image_recognition/tensorflow/resnet50v1_5/inference/fp32/preprocessing.py @@ -0,0 +1,177 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# + +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf +from tensorflow.python.data.experimental import parallel_interleave +from tensorflow.python.data.experimental import map_and_batch +from tensorflow.python.platform import gfile + + +def parse_example_proto(example_serialized): + """Parses an Example proto containing a training example of an image. + """ + # Dense features in Example proto. + feature_map = { + 'image/encoded': tf.io.FixedLenFeature([], dtype=tf.string, + default_value=''), + 'image/class/label': tf.io.FixedLenFeature([1], dtype=tf.int64, + default_value=-1), + 'image/filename': tf.io.FixedLenFeature([], dtype=tf.string, + default_value="") + } + sparse_float32 = tf.io.VarLenFeature(dtype=tf.float32) + # Sparse features in Example proto. 
+ feature_map.update( + {k: sparse_float32 for k in ['image/object/bbox/xmin', + 'image/object/bbox/ymin', + 'image/object/bbox/xmax', + 'image/object/bbox/ymax']}) + + features = tf.io.parse_single_example(serialized=example_serialized, features=feature_map) + label = tf.cast(features['image/class/label'], dtype=tf.int32) + filename = tf.cast(features['image/filename'], dtype=tf.string) + + return features['image/encoded'], label, filename + + +def eval_image(image, height, width, resize_method, + central_fraction=0.875, scope=None): + + with tf.compat.v1.name_scope('eval_image'): + if resize_method == 'crop': + shape = tf.shape(input=image) + image = tf.cond(pred=tf.less(shape[0], shape[1]), + true_fn=lambda: tf.image.resize(image, + tf.convert_to_tensor(value=[256, 256 * shape[1] / shape[0]], + dtype=tf.int32)), + false_fn=lambda: tf.image.resize(image, + tf.convert_to_tensor(value=[256 * shape[0] / shape[1], 256], + dtype=tf.int32))) + + shape = tf.shape(input=image) + y0 = (shape[0] - height) // 2 + x0 = (shape[1] - width) // 2 + distorted_image = tf.image.crop_to_bounding_box(image, y0, x0, height, width) + distorted_image.set_shape([height, width, 3]) + means = tf.broadcast_to([123.68, 116.78, 103.94], tf.shape(input=distorted_image)) + return distorted_image - means + else: # bilinear + if image.dtype != tf.float32: + image = tf.image.convert_image_dtype(image, dtype=tf.float32) + # Crop the central region of the image with an area containing 87.5% of + # the original image. + if central_fraction: + image = tf.image.central_crop(image, central_fraction=central_fraction) + + if height and width: + # Resize the image to the specified height and width. + image = tf.expand_dims(image, 0) + image = tf.image.resize(image, [height, width], + method=tf.image.ResizeMethod.BILINEAR) + image = tf.squeeze(image, [0]) + image = tf.subtract(image, 0.5) + image = tf.multiply(image, 2.0) + return image + +class RecordInputImagePreprocessor(object): + """Preprocessor for images with RecordInput format.""" + + def __init__(self, + height, + width, + batch_size, + num_cores, + resize_method="bilinear"): + + self.height = height + self.width = width + self.batch_size = batch_size + self.num_cores = num_cores + self.resize_method = resize_method + + def parse_and_preprocess(self, value): + # parse + image_buffer, label_index, filename = parse_example_proto(value) + # preprocess + image = tf.image.decode_jpeg( + image_buffer, channels=3, fancy_upscaling=False, dct_method='INTEGER_FAST') + image = eval_image(image, self.height, self.width, self.resize_method) + return (image, label_index, filename) + + def minibatch(self, dataset, subset, cache_data=False): + + with tf.compat.v1.name_scope('batch_processing'): + + glob_pattern = dataset.tf_record_pattern(subset) + file_names = gfile.Glob(glob_pattern) + if not file_names: + raise ValueError('Found no files in --data_dir matching: {}' + .format(glob_pattern)) + ds = tf.data.TFRecordDataset.list_files(file_names) + + ds = ds.apply( + parallel_interleave( + tf.data.TFRecordDataset, cycle_length=self.num_cores, block_length=5, + sloppy=True, + buffer_output_elements=10000, prefetch_input_elements=10000)) + + if cache_data: + ds = ds.take(1).cache().repeat() + + ds = ds.prefetch(buffer_size=10000) + #ds = ds.prefetch(buffer_size=self.batch_size) + + # num of parallel batches not greater than 56 + max_num_parallel_batches = min(56, 2 * self.num_cores) + ds = ds.apply( + map_and_batch( + map_func=self.parse_and_preprocess, + batch_size=self.batch_size, + 
num_parallel_batches=max_num_parallel_batches, + num_parallel_calls=None)) + + ds = ds.prefetch(buffer_size=tf.data.experimental.AUTOTUNE) + + ds_iterator = tf.compat.v1.data.make_one_shot_iterator(ds) + images, labels, filename = ds_iterator.get_next() + # reshape + labels = tf.reshape(labels, [self.batch_size]) + filename = tf.reshape(filename, [self.batch_size]) + + return images, labels, filename diff --git a/models/image_recognition/tensorflow/resnet50v1_5/inference/int8/__init__.py b/models/image_recognition/tensorflow/resnet50v1_5/inference/int8/__init__.py new file mode 100644 index 000000000..159180624 --- /dev/null +++ b/models/image_recognition/tensorflow/resnet50v1_5/inference/int8/__init__.py @@ -0,0 +1,20 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: EPL-2.0 +# + diff --git a/models/image_recognition/tensorflow/resnet50v1_5/inference/int8/datasets.py b/models/image_recognition/tensorflow/resnet50v1_5/inference/int8/datasets.py new file mode 100644 index 000000000..954a54d5c --- /dev/null +++ b/models/image_recognition/tensorflow/resnet50v1_5/inference/int8/datasets.py @@ -0,0 +1,96 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# + +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +"""Benchmark dataset utilities. 
+""" + +from abc import abstractmethod +import os + +import tensorflow as tf + +import preprocessing + +IMAGENET_NUM_TRAIN_IMAGES = 1281167 +IMAGENET_NUM_VAL_IMAGES = 50000 +IMAGENET_NUM_CLASSES = 1000 + +class Dataset(object): + """Abstract class for cnn benchmarks dataset.""" + + def __init__(self, name, data_dir=None): + self.name = name + if data_dir is None: + raise ValueError('Data directory not specified') + self.data_dir = data_dir + + def tf_record_pattern(self, subset): + return os.path.join(self.data_dir, '%s-*-of-*' % subset) + + def reader(self): + return tf.compat.v1.TFRecordReader() + + @abstractmethod + def num_classes(self): + pass + + @abstractmethod + def num_examples_per_epoch(self, subset): + pass + + def __str__(self): + return self.name + + +class ImagenetData(Dataset): + + def __init__(self, data_dir=None): + super(ImagenetData, self).__init__('ImageNet', data_dir) + + def num_classes(self): + return IMAGENET_NUM_CLASSES + + def num_examples_per_epoch(self, subset='train'): + if subset == 'train': + return IMAGENET_NUM_TRAIN_IMAGES + elif subset == 'validation': + return IMAGENET_NUM_VAL_IMAGES + elif subset == 'calibrate' or subset == 'calibration': + return 100 + else: + raise ValueError('Invalid data subset "%s"' % subset) + + def get_image_preprocessor(self): + return preprocessing.RecordInputImagePreprocessor \ No newline at end of file diff --git a/models/image_recognition/tensorflow/resnet50v1_5/inference/int8/eval_image_classifier_inference.py b/models/image_recognition/tensorflow/resnet50v1_5/inference/int8/eval_image_classifier_inference.py new file mode 100644 index 000000000..eefdd6c86 --- /dev/null +++ b/models/image_recognition/tensorflow/resnet50v1_5/inference/int8/eval_image_classifier_inference.py @@ -0,0 +1,281 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# SPDX-License-Identifier: EPL-2.0 +# + +import time +import sys +from argparse import ArgumentParser + +import tensorflow as tf +from tensorflow.python.tools.optimize_for_inference_lib import optimize_for_inference +from tensorflow.python.framework import dtypes + +import datasets +from optimize_for_benchmark import optimize_for_benchmark +import numpy as np + +INPUTS = 'input_tensor' +OUTPUTS = 'softmax_tensor' + +RESNET_IMAGE_SIZE = 224 + + +class eval_classifier_optimized_graph: + """Evaluate image classifier with optimized TensorFlow graph""" + + def __init__(self): + + arg_parser = ArgumentParser(description='Parse args') + + arg_parser.add_argument('-b', "--batch-size", + help="Specify the batch size. 
If this " \ + "parameter is not specified or is -1, the " \ + "largest ideal batch size for the model will " \ + "be used.", + dest="batch_size", type=int, default=-1) + + arg_parser.add_argument('-e', "--num-inter-threads", + help='The number of inter-thread.', + dest='num_inter_threads', type=int, default=0) + + arg_parser.add_argument('-a', "--num-intra-threads", + help='The number of intra-thread.', + dest='num_intra_threads', type=int, default=0) + + arg_parser.add_argument('-m', "--model-name", + help='Specify the model name to run benchmark for', + dest='model_name') + + arg_parser.add_argument('-g', "--input-graph", + help='Specify the input graph for the transform tool', + dest='input_graph') + + arg_parser.add_argument('-d', "--data-location", + help='Specify the location of the data. ' + 'If this parameter is not specified, ' + 'the benchmark will use random/dummy data.', + dest="data_location", default=None) + + arg_parser.add_argument('-r', "--accuracy-only", + help='For accuracy measurement only.', + dest='accuracy_only', action='store_true') + arg_parser.add_argument('--calibrate', dest='calibrate', + help='Run accuracy with calibration data,' + 'to generate min_max ranges, calibrate=[True/False]', + type=bool, default=False) + arg_parser.add_argument("--results-file-path", + help="File path for the inference results", + dest="results_file_path", default=None) + arg_parser.add_argument("--warmup-steps", type=int, default=10, + help="number of warmup steps") + arg_parser.add_argument("--steps", type=int, default=50, + help="number of steps") + + arg_parser.add_argument( + '--data-num-inter-threads', dest='data_num_inter_threads', + help='number threads across operators', + type=int, default=32) + arg_parser.add_argument( + '--data-num-intra-threads', dest='data_num_intra_threads', + help='number threads for data layer operator', + type=int, default=14) + arg_parser.add_argument( + '--num-cores', dest='num_cores', + help='number of cores', + type=int, default=28) + + arg_parser.add_argument("--benchmark", + help='Run in benchmark mode.', + dest='benchmark', action='store_true') + + self.args = arg_parser.parse_args() + # validate the arguements + self.validate_args() + + def write_results_output(self, predictions, filenames, labels): + # If a results_file_path is provided, write the predictions to the file + if self.args.results_file_path: + top_predictions = np.argmax(predictions, 1) + with open(self.args.results_file_path, "a") as fp: + for filename, expected_label, top_prediction in zip(filenames, labels, top_predictions): + fp.write("{},{},{}\n".format(filename, expected_label, top_prediction)) + + def run(self): + """run benchmark with optimized graph""" + + print("Run inference") + + data_config = tf.compat.v1.ConfigProto() + data_config.intra_op_parallelism_threads = self.args.data_num_intra_threads + data_config.inter_op_parallelism_threads = self.args.data_num_inter_threads + data_config.use_per_session_threads = 1 + + infer_config = tf.compat.v1.ConfigProto() + infer_config.intra_op_parallelism_threads = self.args.num_intra_threads + infer_config.inter_op_parallelism_threads = self.args.num_inter_threads + infer_config.use_per_session_threads = 1 + + data_graph = tf.Graph() + with data_graph.as_default(): + if (self.args.data_location): + print("Inference with real data.") + if self.args.calibrate: + subset = 'calibration' + else: + subset = 'validation' + dataset = datasets.ImagenetData(self.args.data_location) + preprocessor = dataset.get_image_preprocessor()( + 
RESNET_IMAGE_SIZE, RESNET_IMAGE_SIZE, self.args.batch_size, + num_cores=self.args.num_cores, + resize_method='crop') + + images, labels, filenames = preprocessor.minibatch(dataset, subset=subset) + + # If a results file path is provided, then start the prediction output file + if self.args.results_file_path: + with open(self.args.results_file_path, "w+") as fp: + fp.write("filename,actual,prediction\n") + else: + print("Inference with dummy data.") + input_shape = [self.args.batch_size, RESNET_IMAGE_SIZE, RESNET_IMAGE_SIZE, 3] + images = tf.random.uniform(input_shape, 0.0, 255.0, dtype=tf.float32, name='synthetic_images') + + infer_graph = tf.Graph() + with infer_graph.as_default(): + graph_def = tf.compat.v1.GraphDef() + with tf.compat.v1.gfile.FastGFile(self.args.input_graph, 'rb') as input_file: + input_graph_content = input_file.read() + graph_def.ParseFromString(input_graph_content) + + output_graph = optimize_for_inference(graph_def, [INPUTS], + [OUTPUTS], dtypes.float32.as_datatype_enum, False) + input_shape = [self.args.batch_size, RESNET_IMAGE_SIZE, RESNET_IMAGE_SIZE, 3] + dummy_input = np.random.randn(*input_shape) + output_graph = optimize_for_benchmark(output_graph, tf.float32, dummy_input) + + tf.import_graph_def(output_graph, name='') + + # Definite input and output Tensors for detection_graph + input_tensor = infer_graph.get_tensor_by_name('input_tensor:0') + output_tensor = infer_graph.get_tensor_by_name('softmax_tensor:0') + + data_sess = tf.compat.v1.Session(graph=data_graph, config=data_config) + infer_sess = tf.compat.v1.Session(graph=infer_graph, config=infer_config) + + num_processed_images = 0 + num_remaining_images = dataset.num_examples_per_epoch(subset=subset) - num_processed_images \ + if self.args.data_location else datasets.IMAGENET_NUM_VAL_IMAGES + + if (not self.args.accuracy_only): + iteration = 0 + warm_up_iteration = self.args.warmup_steps + total_run = self.args.steps + total_time = 0 + + while num_remaining_images >= self.args.batch_size and iteration < total_run: + iteration += 1 + tf_filenames = None + np_labels = None + data_load_start = time.time() + if self.args.results_file_path: + image_np, np_labels, tf_filenames = data_sess.run([images, labels, filenames]) + else: + image_np = data_sess.run(images) + + data_load_time = time.time() - data_load_start + + num_processed_images += self.args.batch_size + num_remaining_images -= self.args.batch_size + + start_time = time.time() + predictions = infer_sess.run(output_tensor) + time_consume = time.time() - start_time + + # Write out the file name, expected label, and top prediction + self.write_results_output(predictions, tf_filenames, np_labels) + + # only add data loading time for real data, not for dummy data + if self.args.data_location: + time_consume += data_load_time + + print('Iteration %d: %.6f sec' % (iteration, time_consume)) + if iteration > warm_up_iteration: + total_time += time_consume + + time_average = total_time / (iteration - warm_up_iteration) + print('Average time: %.6f sec' % (time_average)) + + print('Batch size = %d' % self.args.batch_size) + if (self.args.batch_size == 1): + print('Latency: %.3f ms' % (time_average * 1000)) + # print throughput for both batch size 1 and 128 + print('Throughput: %.3f images/sec' % (self.args.batch_size / time_average)) + + else: # accuracy check + total_accuracy1, total_accuracy5 = (0.0, 0.0) + + while num_remaining_images >= self.args.batch_size: + # Reads and preprocess data + tf_filenames = None + if self.args.results_file_path: + np_images, 
np_labels, tf_filenames = data_sess.run([images, labels, filenames])
+        else:
+          np_images, np_labels = data_sess.run([images, labels])
+        num_processed_images += self.args.batch_size
+        num_remaining_images -= self.args.batch_size
+
+        start_time = time.time()
+        # Compute inference on the preprocessed data
+        predictions = infer_sess.run(output_tensor,
+                                     {input_tensor: np_images})
+        elapsed_time = time.time() - start_time
+
+        # Write out the file name, expected label, and top prediction
+        self.write_results_output(predictions, tf_filenames, np_labels)
+
+        with tf.Graph().as_default() as accu_graph:
+          accuracy1 = tf.reduce_sum(
+            input_tensor=tf.cast(tf.nn.in_top_k(predictions=tf.constant(predictions),
+                                                targets=tf.constant(np_labels), k=1), tf.float32))
+
+          accuracy5 = tf.reduce_sum(
+            input_tensor=tf.cast(tf.nn.in_top_k(predictions=tf.constant(predictions),
+                                                targets=tf.constant(np_labels), k=5), tf.float32))
+          with tf.compat.v1.Session() as accu_sess:
+            np_accuracy1, np_accuracy5 = accu_sess.run([accuracy1, accuracy5])
+
+          total_accuracy1 += np_accuracy1
+          total_accuracy5 += np_accuracy5
+
+        print("Iteration time: %0.4f ms" % (elapsed_time * 1000))
+        print("Processed %d images. (Top1 accuracy, Top5 accuracy) = (%0.4f, %0.4f)" \
+              % (num_processed_images, total_accuracy1 / num_processed_images,
+                 total_accuracy5 / num_processed_images))
+
+  def validate_args(self):
+    """validate the arguments"""
+
+    if not self.args.data_location:
+      if self.args.accuracy_only:
+        raise ValueError("You must use real data for accuracy measurement.")
+
+
+if __name__ == "__main__":
+  evaluate_opt_graph = eval_classifier_optimized_graph()
+  evaluate_opt_graph.run()
diff --git a/models/image_recognition/tensorflow/resnet50v1_5/inference/int8/optimize_for_benchmark.py b/models/image_recognition/tensorflow/resnet50v1_5/inference/int8/optimize_for_benchmark.py
new file mode 100644
index 000000000..ff65baaa8
--- /dev/null
+++ b/models/image_recognition/tensorflow/resnet50v1_5/inference/int8/optimize_for_benchmark.py
@@ -0,0 +1,72 @@
+#
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2022 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#
+
+# Copyright 2017 The TensorFlow Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================== + +from tensorflow.core.framework import graph_pb2 +from tensorflow.core.framework import node_def_pb2 +from tensorflow.python.framework import tensor_util +from tensorflow.python.tools.optimize_for_inference_lib import ensure_graph_is_valid + +import numpy as np + +def optimize_for_benchmark(input_graph_def, const_dtype, dummy_input): + ensure_graph_is_valid(input_graph_def) + optimized_graph_def = change_placehoder_to_const(input_graph_def, const_dtype, dummy_input) + ensure_graph_is_valid(input_graph_def) + return optimized_graph_def + +def change_placehoder_to_const(input_graph_def, const_dtype, dummy_input): + result_graph_def = graph_pb2.GraphDef() + for node in input_graph_def.node: + if node.op == 'Placeholder': + new_const = node_def_pb2.NodeDef() + new_const.op = 'Const' + new_const.name = node.name + new_const.attr["dtype"].CopyFrom(node.attr["dtype"]) + tensor_proto = tensor_util.make_tensor_proto(dummy_input, + const_dtype, + dummy_input.shape) + new_const.attr["value"].tensor.CopyFrom(tensor_proto) + result_graph_def.node.extend([new_const]) + else: + new_node = node_def_pb2.NodeDef() + new_node.CopyFrom(node) + retained_input = [] + for input_node in new_node.input: + retained_input.append(input_node) + new_node.input[:] = retained_input + + result_graph_def.node.extend([new_node]) + + return result_graph_def diff --git a/models/image_recognition/tensorflow/resnet50v1_5/inference/int8/preprocessing.py b/models/image_recognition/tensorflow/resnet50v1_5/inference/int8/preprocessing.py new file mode 100644 index 000000000..0228a1e36 --- /dev/null +++ b/models/image_recognition/tensorflow/resnet50v1_5/inference/int8/preprocessing.py @@ -0,0 +1,178 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# + +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import tensorflow as tf +from tensorflow.python.data.experimental import parallel_interleave +from tensorflow.python.data.experimental import map_and_batch +from tensorflow.python.platform import gfile + + +def parse_example_proto(example_serialized): + """Parses an Example proto containing a training example of an image. 
+ """ + # Dense features in Example proto. + feature_map = { + 'image/encoded': tf.io.FixedLenFeature([], dtype=tf.string, + default_value=''), + 'image/class/label': tf.io.FixedLenFeature([1], dtype=tf.int64, + default_value=-1), + 'image/filename': tf.io.FixedLenFeature([], dtype=tf.string, + default_value="") + } + sparse_float32 = tf.io.VarLenFeature(dtype=tf.float32) + # Sparse features in Example proto. + feature_map.update( + {k: sparse_float32 for k in ['image/object/bbox/xmin', + 'image/object/bbox/ymin', + 'image/object/bbox/xmax', + 'image/object/bbox/ymax']}) + + features = tf.io.parse_single_example(serialized=example_serialized, features=feature_map) + label = tf.cast(features['image/class/label'], dtype=tf.int32) + filename = tf.cast(features['image/filename'], dtype=tf.string) + + return features['image/encoded'], label, filename + + +def eval_image(image, height, width, resize_method, + central_fraction=0.875, scope=None): + + with tf.compat.v1.name_scope('eval_image'): + if resize_method == 'crop': + shape = tf.shape(input=image) + image = tf.cond(pred=tf.less(shape[0], shape[1]), + true_fn=lambda: tf.image.resize(image, + tf.convert_to_tensor(value=[256, 256 * shape[1] / shape[0]], + dtype=tf.int32)), + false_fn=lambda: tf.image.resize(image, + tf.convert_to_tensor(value=[256 * shape[0] / shape[1], 256], + dtype=tf.int32))) + + shape = tf.shape(input=image) + y0 = (shape[0] - height) // 2 + x0 = (shape[1] - width) // 2 + distorted_image = tf.image.crop_to_bounding_box(image, y0, x0, height, width) + distorted_image.set_shape([height, width, 3]) + means = tf.broadcast_to([123.68, 116.78, 103.94], tf.shape(input=distorted_image)) + return distorted_image - means + else: # bilinear + if image.dtype != tf.float32: + image = tf.image.convert_image_dtype(image, dtype=tf.float32) + # Crop the central region of the image with an area containing 87.5% of + # the original image. + if central_fraction: + image = tf.image.central_crop(image, central_fraction=central_fraction) + + if height and width: + # Resize the image to the specified height and width. 
+ image = tf.expand_dims(image, 0) + image = tf.image.resize(image, [height, width], + method=tf.image.ResizeMethod.BILINEAR) + image = tf.squeeze(image, [0]) + image = tf.subtract(image, 0.5) + image = tf.multiply(image, 2.0) + return image + +class RecordInputImagePreprocessor(object): + """Preprocessor for images with RecordInput format.""" + + def __init__(self, + height, + width, + batch_size, + num_cores, + resize_method="bilinear"): + + self.height = height + self.width = width + self.batch_size = batch_size + self.num_cores = num_cores + self.resize_method = resize_method + + def parse_and_preprocess(self, value): + # parse + image_buffer, label_index, filename = parse_example_proto(value) + # preprocess + image = tf.image.decode_jpeg( + image_buffer, channels=3, fancy_upscaling=False, dct_method='INTEGER_FAST') + image = eval_image(image, self.height, self.width, self.resize_method) + image, _ , _ = tf.quantization.quantize(image, -123.68, 151.06, tf.qint8) + return (image, label_index, filename) + + def minibatch(self, dataset, subset, cache_data=False): + + with tf.compat.v1.name_scope('batch_processing'): + + glob_pattern = dataset.tf_record_pattern(subset) + file_names = gfile.Glob(glob_pattern) + if not file_names: + raise ValueError('Found no files in --data_dir matching: {}' + .format(glob_pattern)) + ds = tf.data.TFRecordDataset.list_files(file_names) + + ds = ds.apply( + parallel_interleave( + tf.data.TFRecordDataset, cycle_length=self.num_cores, block_length=5, + sloppy=True, + buffer_output_elements=10000, prefetch_input_elements=10000)) + + if cache_data: + ds = ds.take(1).cache().repeat() + + ds = ds.prefetch(buffer_size=10000) + #ds = ds.prefetch(buffer_size=self.batch_size) + + # num of parallel batches not greater than 56 + max_num_parallel_batches = min(56, 2 * self.num_cores) + ds = ds.apply( + map_and_batch( + map_func=self.parse_and_preprocess, + batch_size=self.batch_size, + num_parallel_batches=max_num_parallel_batches, + num_parallel_calls=None)) + + ds = ds.prefetch(buffer_size=tf.data.experimental.AUTOTUNE) + + ds_iterator = tf.compat.v1.data.make_one_shot_iterator(ds) + images, labels, filename = ds_iterator.get_next() + # reshape + labels = tf.reshape(labels, [self.batch_size]) + filename = tf.reshape(filename, [self.batch_size]) + + return images, labels, filename diff --git a/models/image_recognition/tensorflow/resnet50v1_5/training/mlperf_resnet/imagenet_main.py b/models/image_recognition/tensorflow/resnet50v1_5/training/mlperf_resnet/imagenet_main.py index f8917b16e..f67b1c7fd 100644 --- a/models/image_recognition/tensorflow/resnet50v1_5/training/mlperf_resnet/imagenet_main.py +++ b/models/image_recognition/tensorflow/resnet50v1_5/training/mlperf_resnet/imagenet_main.py @@ -43,6 +43,7 @@ import numpy.random import tensorflow as tf # pylint: disable=g-bad-import-order +tf.compat.v1.disable_eager_execution() from mlperf_compliance import mlperf_log from mlperf_resnet import imagenet_preprocessing @@ -167,6 +168,7 @@ def parse_record(raw_record, is_training, dtype): def input_fn(is_training, data_dir, batch_size, num_epochs=1, num_gpus=None, + datasets_num_private_threads=None, dtype=tf.float32): """Input function which provides batches for train or eval. 
@@ -192,6 +194,11 @@ def input_fn(is_training, data_dir, batch_size, num_epochs=1, num_gpus=None, # Convert to individual records dataset = dataset.flat_map(tf.data.TFRecordDataset) + if datasets_num_private_threads: + options = tf.data.Options() + options.experimental_threading.private_threadpool_size = datasets_num_private_threads + dataset = dataset.with_options(options) + return resnet_run_loop.process_record_dataset( dataset=dataset, is_training=is_training, @@ -205,9 +212,9 @@ def input_fn(is_training, data_dir, batch_size, num_epochs=1, num_gpus=None, ) -def get_synth_input_fn(): +def get_synth_input_fn(use_bfloat16): return resnet_run_loop.get_synth_input_fn( - _DEFAULT_IMAGE_SIZE, _DEFAULT_IMAGE_SIZE, _NUM_CHANNELS, _NUM_CLASSES) + _DEFAULT_IMAGE_SIZE, _DEFAULT_IMAGE_SIZE, _NUM_CHANNELS, _NUM_CLASSES, use_bfloat16) ############################################################################### @@ -356,7 +363,7 @@ def main(argv): value=_NUM_IMAGES['train']) mlperf_log.resnet_print(key=mlperf_log.PREPROC_NUM_EVAL_EXAMPLES, value=_NUM_IMAGES['validation']) - input_function = flags.use_synthetic_data and get_synth_input_fn() or input_fn + input_function = flags.use_synthetic_data and get_synth_input_fn(flags.use_bfloat16) or input_fn resnet_run_loop.resnet_main(seed, flags, imagenet_model_fn, input_function, diff --git a/models/image_recognition/tensorflow/resnet50v1_5/training/mlperf_resnet/resnet_model.py b/models/image_recognition/tensorflow/resnet50v1_5/training/mlperf_resnet/resnet_model.py index 68922ceb8..c23d8fb1e 100644 --- a/models/image_recognition/tensorflow/resnet50v1_5/training/mlperf_resnet/resnet_model.py +++ b/models/image_recognition/tensorflow/resnet50v1_5/training/mlperf_resnet/resnet_model.py @@ -189,7 +189,7 @@ def _bottleneck_block_v1(inputs, filters, training, projection_shortcut, mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_SHORTCUT_ADD) # TODO(nhasabni): temporarily replacing Add by AddN for performance. # Remove it later once we optimize this in graph. - inputs = tf.math.add_n([inputs, shortcut]) + inputs = tf.math.add(inputs, shortcut) mlperf_log.resnet_print(key=mlperf_log.MODEL_HP_RELU) inputs = tf.nn.relu(inputs) diff --git a/models/image_recognition/tensorflow/resnet50v1_5/training/mlperf_resnet/resnet_run_loop.py b/models/image_recognition/tensorflow/resnet50v1_5/training/mlperf_resnet/resnet_run_loop.py index 2e8404b20..bc1db34fb 100644 --- a/models/image_recognition/tensorflow/resnet50v1_5/training/mlperf_resnet/resnet_run_loop.py +++ b/models/image_recognition/tensorflow/resnet50v1_5/training/mlperf_resnet/resnet_run_loop.py @@ -136,7 +136,7 @@ def process_record_dataset(dataset, is_training, batch_size, shuffle_buffer, return dataset -def get_synth_input_fn(height, width, num_channels, num_classes): +def get_synth_input_fn(height, width, num_channels, num_classes, use_bfloat16): """Returns an input function that returns a dataset with zeroes. This is useful in debugging input pipeline performance, as it removes all @@ -154,8 +154,11 @@ def get_synth_input_fn(height, width, num_channels, num_classes): that can be used for iteration. 
""" def input_fn(is_training, data_dir, batch_size, *args, **kwargs): # pylint: disable=unused-argument - images = tf.zeros((batch_size, height, width, num_channels), tf.float32) - labels = tf.zeros((batch_size, num_classes), tf.int32) + if use_bfloat16: + images = tf.zeros((batch_size, height, width, num_channels), tf.bfloat16) + else: + images = tf.zeros((batch_size, height, width, num_channels), tf.float32) + labels = tf.zeros((batch_size), tf.int32) return tf.data.Dataset.from_tensors((images, labels)).repeat() return input_fn @@ -196,11 +199,11 @@ def learning_rate_with_decay( def learning_rate_fn(global_step): lr = tf.compat.v1.train.piecewise_constant(global_step, boundaries, vals) - warmup_steps = int(batches_per_epoch * 5) + warmup_steps = batches_per_epoch * 5 + global_step_f32 = tf.cast(global_step, tf.float32) warmup_lr = ( - initial_learning_rate * tf.cast(global_step, tf.float32) / tf.cast( - warmup_steps, tf.float32)) - return tf.cond(pred=global_step < warmup_steps, true_fn=lambda: warmup_lr, false_fn=lambda: lr) + initial_learning_rate * global_step_f32 / warmup_steps) + return tf.cond(pred=global_step_f32 < warmup_steps, true_fn=lambda: warmup_lr, false_fn=lambda: lr) def poly_rate_fn(global_step): """Handles linear scaling rule, gradual warmup, and LR decay. @@ -300,16 +303,14 @@ def resnet_model_fn(features, labels, mode, model_class, current mode. """ - # Generate a summary node for the images - tf.compat.v1.summary.image('images', features, max_outputs=6) - - # Checks that features/images have same data type being used for calculations. - assert features.dtype == dtype + # Generate a summary node for the images, this API does not support bf16 + if not use_bfloat16: + tf.compat.v1.summary.image('images', features, max_outputs=6) if use_bfloat16 == True: dtype = tf.bfloat16 - - features = tf.cast(features, dtype) + else: + features = tf.cast(features, dtype) model = model_class(resnet_size, data_format, version=version, dtype=dtype) @@ -401,7 +402,7 @@ def exclude_batch_norm(name): momentum=momentum ) if is_mpi: - optimizer = hvd.DistributedOptimizer(optimizer) + optimizer = hvd.DistributedOptimizer(optimizer, num_groups=1) if use_float16 and loss_scale == 1: optimizer = tf.compat.v1.train.experimental.enable_mixed_precision_graph_rewrite(optimizer) @@ -414,7 +415,7 @@ def exclude_batch_norm(name): # When computing fp16 gradients, often intermediate tensor values are # so small, they underflow to 0. To avoid this, we multiply the loss by # loss_scale to make these tensor values loss_scale times bigger. - scaled_grad_vars = optimizer.compute_gradients(loss * loss_scale) + scaled_grad_vars = optimizer.compute_gradients(loss * loss_scale, gate_gradients=tf.compat.v1.train.Optimizer.GATE_NONE) # Once the gradient computation is complete we can scale the gradients # back to the correct scale before passing them to the optimizer. 
@@ -422,7 +423,7 @@ def exclude_batch_norm(name): for grad, var in scaled_grad_vars] minimize_op = optimizer.apply_gradients(unscaled_grad_vars, global_step) else: - minimize_op = optimizer.minimize(loss, global_step) + minimize_op = optimizer.minimize(loss, global_step, gate_gradients=tf.compat.v1.train.Optimizer.GATE_NONE) update_ops = tf.compat.v1.get_collection(tf.compat.v1.GraphKeys.UPDATE_OPS) train_op = tf.group(minimize_op, update_ops, num_examples_metric[1]) @@ -514,25 +515,23 @@ def resnet_main(seed, flags, model_function, input_function, shape=None): allow_soft_placement=True) session_config.graph_options.rewrite_options.remapping = ( rewriter_config_pb2.RewriterConfig.AGGRESSIVE) + if is_mpi: + gpus = tf.config.experimental.list_physical_devices('XPU') + for gpu in gpus: + tf.config.experimental.set_memory_growth(gpu, True) + if gpus: + tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'XPU') + session_config.gpu_options.visible_device_list = str(hvd.local_rank()) if flags.use_float16: session_config.graph_options.rewrite_options.auto_mixed_precision = ( rewriter_config_pb2.RewriterConfig.ON) - - if flags.num_gpus == 0: distribution = tf.distribute.OneDeviceStrategy('device:CPU:0') - elif flags.num_gpus == 1: - distribution = tf.distribute.OneDeviceStrategy('device:GPU:0') - else: - distribution = tf.distribute.MirroredStrategy( - num_gpus=flags.num_gpus - ) mlperf_log.resnet_print(key=mlperf_log.RUN_SET_RANDOM_SEED, value=seed) - run_config = tf.estimator.RunConfig(train_distribute=distribution, - session_config=session_config, - log_step_count_steps=10, # output logs more frequently + run_config = tf.estimator.RunConfig(session_config=session_config, + log_step_count_steps=1, # output logs more frequently tf_random_seed=seed) mlperf_log.resnet_print(key=mlperf_log.INPUT_BATCH_SIZE, @@ -629,6 +628,7 @@ def input_fn_train(): batch_size=per_device_batch_size(flags.batch_size, flags.num_gpus), num_epochs=flags.epochs_between_evals, num_gpus=flags.num_gpus, + datasets_num_private_threads=flags.intra_op_parallelism_threads, dtype=flags.dtype ) if is_mpi: diff --git a/models/image_recognition/tensorflow/resnet50v1_5/training/mlperf_utils/logs/hooks_helper.py b/models/image_recognition/tensorflow/resnet50v1_5/training/mlperf_utils/logs/hooks_helper.py index e92dc5d23..83007670b 100644 --- a/models/image_recognition/tensorflow/resnet50v1_5/training/mlperf_utils/logs/hooks_helper.py +++ b/models/image_recognition/tensorflow/resnet50v1_5/training/mlperf_utils/logs/hooks_helper.py @@ -33,7 +33,6 @@ 'cross_entropy', 'train_accuracy']) - def get_train_hooks(name_list, **kwargs): """Factory for getting a list of TensorFlow hooks for training by name. diff --git a/models/language_modeling/pytorch/bert_large/inference/gpu/.gitignore b/models/language_modeling/pytorch/bert_large/inference/gpu/.gitignore new file mode 100644 index 000000000..c769faa86 --- /dev/null +++ b/models/language_modeling/pytorch/bert_large/inference/gpu/.gitignore @@ -0,0 +1,2 @@ +huggingface/ +*.tar.gz diff --git a/models/language_modeling/pytorch/bert_large/inference/gpu/cmd.sh b/models/language_modeling/pytorch/bert_large/inference/gpu/cmd.sh new file mode 100755 index 000000000..0b9aa2c68 --- /dev/null +++ b/models/language_modeling/pytorch/bert_large/inference/gpu/cmd.sh @@ -0,0 +1,70 @@ +#!/bin/bash +# +# Copyright (c) 2022-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +NUMA_ARGS="" +if command -v numactl >& /dev/null ; then +if [ "x$MPI_LOCALRANKID" != "x" ] ; then + REAL_NUM_NUMA_NODES=`lscpu | grep "NUMA node(s):" | awk '{print $NF}'` + PPNUMA=$(( MPI_LOCALNRANKS / REAL_NUM_NUMA_NODES )) + if [ $PPNUMA -eq 0 ] ; then + if [ "x$SINGLE_SOCKET_ONLY" == "x1" ] ; then + NUMA_ARGS="numactl -m 0 " + fi + else + NUMARANK=$(( MPI_LOCALRANKID / PPNUMA )) + NUMA_ARGS="$NUMA_ARGS $GDB_ARGS " + fi + NUM_RANKS=$PMI_SIZE +else + NUMA_ARGS="numactl -m 0 " + NUM_RANKS=1 +fi +fi + +if [ "x$1" == "x-gdb" ] ; then +GDB_ARGS="gdb --args " +shift +else +GDB_ARGS="" +fi + +# set dataset +if test -z $dataset || ! test -d $dataset ; then + if test -d ./SQUAD1 ; then + dataset=./SQUAD1 + else + echo "Unable to find SQUAD dataset path" + exit 1 + fi +fi + +$NUMA_ARGS $GDB_ARGS python -u run_squad.py \ + --model_type bert \ + --model_name_or_path bert-large-uncased-whole-word-masking \ + --do_train \ + --do_eval \ + --do_lower_case \ + --train_file $dataset/train-v1.1.json \ + --predict_file $dataset/dev-v1.1.json \ + --per_gpu_train_batch_size 24 \ + --learning_rate 3e-5 \ + --num_train_epochs 2.0 \ + --max_seq_length 384 \ + --doc_stride 128 \ + --output_dir /tmp/debug_squad/ \ + $@ + diff --git a/models/language_modeling/pytorch/bert_large/inference/gpu/cmd_infer.sh b/models/language_modeling/pytorch/bert_large/inference/gpu/cmd_infer.sh new file mode 100755 index 000000000..756c86030 --- /dev/null +++ b/models/language_modeling/pytorch/bert_large/inference/gpu/cmd_infer.sh @@ -0,0 +1,139 @@ +#!/bin/bash +# +# Copyright (c) 2022-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +function Parser() { + while [ $# -ne 0 ]; do + case $1 in + -m) + shift + MODEL="$1" + ;; + -d) + shift + DEVICE="$1" + ;; + -b) + shift + BATCH="$1" + ;; + -t) + shift + DTYPE="$1" + ;; + -n) + shift + if [ $1 -gt 0 ];then + NUM_ITER="$1" + fi + ;; + -o) + shift + OUTPUT_DIR="$1" + ;; + -g) + GDB_ARGS="gdb --args " + ;; + -h | --help) + echo "Usage: cmd_infer.sh [OPTION...] PAGE..." + echo "-m, Optional Specify the model type[bert_base or bert_large]. The default value is bert_base" + echo "-d, Optional Specify the device[cpu, xpu]. The default device is cpu" + echo "-b, Optional Specify the batch size. The default value is 32" + echo "-t, Optional Specify the dtype[FP32, FP16...]. The default value is FP32" + echo "-n, Optional Specify the number of iterations to run evaluation" + echo "-o, Optional Specify the output dir. 
The default value is /tmp/debug_squad/"
+      echo "-g, Optional use gdb"
+      exit
+      ;;
+    --*|-*)
+      echo ">>> New param: <$1>"
+      ;;
+    *)
+      echo ">>> Parsing mismatch: $1"
+      ;;
+    esac
+    shift
+  done
+}
+
+MODEL="bert_base"
+DEVICE=cpu
+BATCH=32
+DTYPE=FP32
+NUM_ITER=-1
+OUTPUT_DIR=/tmp/debug_squad/
+GDB_ARGS=""
+NUMA_ARGS=""
+
+Parser $@
+
+if command -v numactl >& /dev/null ; then
+if [ "x$MPI_LOCALRANKID" != "x" ] ; then
+  REAL_NUM_NUMA_NODES=`lscpu | grep "NUMA node(s):" | awk '{print $NF}'`
+  PPNUMA=$(( MPI_LOCALNRANKS / REAL_NUM_NUMA_NODES ))
+  if [ $PPNUMA -eq 0 ] ; then
+    if [ "x$SINGLE_SOCKET_ONLY" == "x1" ] ; then
+      NUMA_ARGS="numactl -m 0 "
+    fi
+  else
+    NUMARANK=$(( MPI_LOCALRANKID / PPNUMA ))
+    NUMA_ARGS="$NUMA_ARGS $GDB_ARGS "
+  fi
+  NUM_RANKS=$PMI_SIZE
+else
+  NUMA_ARGS="numactl -m 0 "
+  NUM_RANKS=1
+fi
+fi
+
+# set dataset and model_path
+if test -z $dataset || ! test -d $dataset ; then
+  dataset=$DATASET_DIR
+fi
+
+if [ "$MODEL" == "bert_base" ] ; then
+  if test -d ./squad_base_finetuned_checkpoint ; then
+    :
+  else
+    ./download_squad_base_fine_tuned_model.sh
+  fi
+  model_path=./squad_base_finetuned_checkpoint
+elif [ "$MODEL" == "bert_large" ] ; then
+  if test -d ./squad_large_finetuned_checkpoint ; then
+    :
+  else
+    ./download_squad_large_fine_tuned_model.sh
+  fi
+  model_path=./squad_large_finetuned_checkpoint
+else
+  echo "The model (${MODEL}) does not exist."
+  exit
+fi
+
+$NUMA_ARGS $GDB_ARGS python -u run_squad.py \
+  --model_type bert \
+  --model_name_or_path $model_path \
+  --do_eval \
+  --do_lower_case \
+  --do_jit \
+  --device_choice ${DEVICE} \
+  --dtype ${DTYPE} \
+  --predict_file $dataset/dev-v1.1.json \
+  --per_gpu_eval_batch_size ${BATCH} \
+  --max_seq_length 384 \
+  --doc_stride 128 \
+  --num_iterations ${NUM_ITER} \
+  --output_dir ${OUTPUT_DIR}
diff --git a/models/language_modeling/pytorch/bert_large/inference/gpu/download_squad_large_fine_tuned_model.sh b/models/language_modeling/pytorch/bert_large/inference/gpu/download_squad_large_fine_tuned_model.sh
new file mode 100755
index 000000000..d60d97cd9
--- /dev/null
+++ b/models/language_modeling/pytorch/bert_large/inference/gpu/download_squad_large_fine_tuned_model.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+#
+# Copyright (c) 2022-2023 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# + +#https://huggingface.co/bert-large-uncased-whole-word-masking-finetuned-squad/tree/main +#mkdir bert-large-uncased-whole-word-masking-finetuned-squad +set -e +mkdir squad_large_finetuned_checkpoint && cd squad_large_finetuned_checkpoint +wget -c https://huggingface.co/bert-large-uncased-whole-word-masking-finetuned-squad/resolve/main/config.json +wget -c https://huggingface.co/bert-large-uncased-whole-word-masking-finetuned-squad/resolve/main/pytorch_model.bin +wget -c https://huggingface.co/bert-large-uncased-whole-word-masking-finetuned-squad/resolve/main/tokenizer.json +wget -c https://huggingface.co/bert-large-uncased-whole-word-masking-finetuned-squad/resolve/main/tokenizer_config.json +wget -c https://huggingface.co/bert-large-uncased-whole-word-masking-finetuned-squad/resolve/main/vocab.txt + diff --git a/models/language_modeling/pytorch/bert_large/inference/gpu/requirements.txt b/models/language_modeling/pytorch/bert_large/inference/gpu/requirements.txt new file mode 100644 index 000000000..967e937d1 --- /dev/null +++ b/models/language_modeling/pytorch/bert_large/inference/gpu/requirements.txt @@ -0,0 +1,2 @@ +tensorboardX +transformers==4.25.1 diff --git a/models/language_modeling/pytorch/bert_large/inference/gpu/run_squad.py b/models/language_modeling/pytorch/bert_large/inference/gpu/run_squad.py new file mode 100644 index 000000000..d5ed19a07 --- /dev/null +++ b/models/language_modeling/pytorch/bert_large/inference/gpu/run_squad.py @@ -0,0 +1,1411 @@ +# coding=utf-8 +# Copyright (c) 2022-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team. +# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
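The download script above only stages the fine-tuned checkpoint files locally. As a rough sketch (assuming the pinned transformers==4.25.1; the loading calls are an illustration rather than an excerpt from run_squad.py), a checkpoint directory laid out this way is typically consumed as:

from transformers import AutoModelForQuestionAnswering, AutoTokenizer

# Directory populated by the wget calls in download_squad_large_fine_tuned_model.sh.
checkpoint_dir = "./squad_large_finetuned_checkpoint"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint_dir)
model.eval()  # inference only, matching the --do_eval flow driven by cmd_infer.sh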
+""" Finetuning the library models for question-answering on SQuAD (DistilBERT, Bert, XLM, XLNet).""" + + +import argparse +import glob +import logging +import os +import random +import timeit +import time + +import numpy as np +import torch +import torch.nn as nn +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler +from torch.utils.data.distributed import DistributedSampler +from tqdm import tqdm, trange + +import intel_extension_for_pytorch +import transformers +from transformers import ( + MODEL_FOR_QUESTION_ANSWERING_MAPPING, + WEIGHTS_NAME, + AdamW, + AutoConfig, + AutoModelForQuestionAnswering, + AutoTokenizer, + get_linear_schedule_with_warmup, + squad_convert_examples_to_features, +) +from transformers.data.metrics.squad_metrics import ( + compute_predictions_log_probs, + compute_predictions_logits, + squad_evaluate, +) + +from transformers.modeling_outputs import ( + QuestionAnsweringModelOutput, +) + +from transformers.data.processors.squad import ( + SquadResult, + SquadV1Processor, + SquadV2Processor, +) +from transformers.trainer_utils import is_main_process + + +try: + from torch.utils.tensorboard import SummaryWriter +except ImportError: + from tensorboardX import SummaryWriter + +logger = logging.getLogger(__name__) + +MODEL_CONFIG_CLASSES = list(MODEL_FOR_QUESTION_ANSWERING_MAPPING.keys()) +MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) + +MAP_TORCH_DTYPE = {"FP32": torch.float32, "FP16": torch.float16, + "BF16": torch.bfloat16, "FP64": torch.float64, + "CF64": torch.cfloat, "CF128": torch.cdouble, + "UINT8": torch.uint8, "INT8": torch.int8, + "INT16": torch.int16, "INT": torch.int32, + "INT64": torch.int64, "BOOL": torch.bool, + } + +hub = os.path.expanduser("~/.cache/torch/intel") +if not os.path.exists(hub): + os.makedirs(hub) + + +def set_seed(args): + random.seed(args.seed) + np.random.seed(args.seed) + torch.manual_seed(args.seed) + if args.use_pcl: + pcl_bert.set_rnd_seed(args.seed) + if (args.n_gpu > 0) and (args.device_choice == "cuda"): + torch.cuda.manual_seed_all(args.seed) + elif (args.n_gpu > 0) and (args.device_choice == "xpu"): + torch.xpu.manual_seed_all(args.seed) + + +def to_list(tensor): + return tensor.detach().cpu().tolist() + + +def load_model(args, config = None, checkpoint = None): + model = None + if checkpoint and args.use_pcl: + with pcl_bert.pcl_impl(args.use_pcl, args.pcl_bf16, args.unpad): + model = AutoModelForQuestionAnswering.from_pretrained( + checkpoint + ) # , force_download=True) + + elif checkpoint and (not args.use_pcl): + model = AutoModelForQuestionAnswering.from_pretrained( + checkpoint + ) # , force_download=True) + + elif (not checkpoint) and args.use_pcl: + with pcl_bert.pcl_impl(args.use_pcl, args.pcl_bf16, args.unpad): + model = AutoModelForQuestionAnswering.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + cache_dir=args.cache_dir if args.cache_dir else None, + ) + else: + model = AutoModelForQuestionAnswering.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + cache_dir=args.cache_dir if args.cache_dir else None, + ) + return model + +def load_jit_model(model, inputs, dtype, device, jit_trace_path): + jit_model = None + if (os.path.isfile(jit_trace_path)): + print("load trace model ...") + jit_model = torch.jit.load(jit_trace_path) + print("load trace model done") + else: + print("create trace model") + in_1 = 
torch.unsqueeze(inputs["input_ids"][0].clone(), 0) + in_2 = torch.unsqueeze(inputs["token_type_ids"][0].clone(), 0) + in_3 = torch.unsqueeze(inputs["attention_mask"][0].clone(), 0) + with torch.xpu.amp.autocast(enabled=True, dtype=dtype, cache_enabled=False): + jit_model = torch.jit.trace(model, + (in_1.to(device), + in_2.to(device), + in_3.to(device)), + strict = False) + jit_model.save(jit_trace_path) + + return jit_model + +def mkdir(path): + if not os.path.exists(path): + os.makedirs(path) + + +def collate_fn_(batch, device=None, dtype=None): + for key, value in batch.items(): + if device: + batch[key] = value.to(device) + if ((isinstance(value, torch.FloatTensor)\ + or isinstance(value, torch.cuda.FloatTensor)\ + or isinstance(value, torch.xpu.FloatTensor))\ + and (dtype != None)\ + and (dtype != "FP32")): + batch[key] = value.to(MAP_TORCH_DTYPE[dtype]) + return batch + +def model_cast(model, device=None, dtype=None): + # Set device type + if (device): + model.to(device) + + # Set data type + if (dtype == None): + pass + elif (dtype == "FP16"): + model.half() + elif (dtype == "BF16"): + model.bfloat16() + elif (dtype == "FP64"): + model.double() + elif (dtype == "FP32"): + model.float() + else: + logger.error("The datatype for model casting not yet supported by pytorch") + +def get_device(device_choice): + if device_choice == "xpu": + device = torch.device( + "xpu" if torch.xpu.is_available() else "cpu" + ) + n_gpu = 1 if torch.xpu.is_available() else 0 + elif device_choice == "cuda": + device = torch.device( + "cuda" if torch.cuda.is_available() else "cpu" + ) + n_gpu = torch.cuda.device_count() if torch.cuda.is_available() else 0 + else : + device = torch.device("cpu") + n_gpu = 0 + return device, n_gpu + + +def train(args, train_dataset, model, tokenizer): + """Train the model""" + if args.local_rank in [-1, 0]: + tb_writer = SummaryWriter() + + args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) + train_sampler = ( + RandomSampler(train_dataset) + if args.local_rank == -1 + else DistributedSampler(train_dataset) + ) + train_dataloader = DataLoader( + train_dataset, sampler=train_sampler, batch_size=args.train_batch_size + ) + + if args.max_steps > 0: + t_total = args.max_steps + args.num_train_epochs = ( + args.max_steps + // (len(train_dataloader) // args.gradient_accumulation_steps) + + 1 + ) + else: + t_total = ( + len(train_dataloader) + // args.gradient_accumulation_steps + * args.num_train_epochs + ) + + # Prepare optimizer and schedule (linear warmup and decay) + no_decay = ["bias", "LayerNorm.weight"] + optimizer_grouped_parameters = [ + { + "params": [ + p + for n, p in model.named_parameters() + if not any(nd in n for nd in no_decay) + ], + "weight_decay": args.weight_decay, + }, + { + "params": [ + p + for n, p in model.named_parameters() + if any(nd in n for nd in no_decay) + ], + "weight_decay": 0.0, + }, + ] + if args.use_pcl: + optimizer = pcl_bert.AdamW( + optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon + ) + else: + optimizer = AdamW( + optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon + ) + scheduler = get_linear_schedule_with_warmup( + optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total + ) + + # Check if saved optimizer or scheduler states exist + if os.path.isfile( + os.path.join(args.model_name_or_path, "optimizer.pt") + ) and os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt")): + # Load in optimizer and scheduler states + 
optimizer.load_state_dict( + torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")) + ) + scheduler.load_state_dict( + torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")) + ) + + if (args.dtype == "FP16") and (args.device_choice == "cuda"): + try: + from apex import amp + except ImportError: + raise ImportError( + "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." + ) + + model, optimizer = amp.initialize( + model, optimizer, opt_level=args.fp16_opt_level + ) + + # multi-gpu training (should be after apex fp16 initialization) + if args.n_gpu > 1: + model = torch.nn.DataParallel(model) + + # Distributed training (should be after apex fp16 initialization) + if args.local_rank != -1: + if args.n_gpu == 0: + model = torch.nn.parallel.DistributedDataParallel( + model, find_unused_parameters=True + ) + else: + model = torch.nn.parallel.DistributedDataParallel( + model, + device_ids=[args.local_rank], + output_device=args.local_rank, + find_unused_parameters=True, + ) + + # Train! + logger.info("***** Running training *****") + logger.info(" Num examples = %d", len(train_dataset)) + logger.info(" Num Epochs = %d", args.num_train_epochs) + logger.info( + " Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size + ) + logger.info( + " Total train batch size (w. parallel, distributed & accumulation) = %d", + args.train_batch_size + * args.gradient_accumulation_steps + * (torch.distributed.get_world_size() if args.local_rank != -1 else 1), + ) + logger.info(" Gradient Accumulation steps = %d", args.gradient_accumulation_steps) + logger.info(" Total optimization steps = %d", t_total) + + global_step = 1 + epochs_trained = 0 + steps_trained_in_current_epoch = 0 + # Check if continuing training from a checkpoint + if os.path.exists(args.model_name_or_path): + try: + # set global_step to gobal_step of last saved checkpoint from model path + checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0] + global_step = int(checkpoint_suffix) + epochs_trained = global_step // ( + len(train_dataloader) // args.gradient_accumulation_steps + ) + steps_trained_in_current_epoch = global_step % ( + len(train_dataloader) // args.gradient_accumulation_steps + ) + + logger.info( + " Continuing training from checkpoint, will skip to saved global_step" + ) + logger.info(" Continuing training from epoch %d", epochs_trained) + logger.info(" Continuing training from global step %d", global_step) + logger.info( + " Will skip the first %d steps in the first epoch", + steps_trained_in_current_epoch, + ) + except ValueError: + logger.info(" Starting fine-tuning.") + + tr_loss, logging_loss = 0.0, 0.0 + model.zero_grad() + train_iterator = trange( + epochs_trained, + int(args.num_train_epochs), + desc="Epoch", + disable=args.local_rank not in [-1, 0], + ) + # Added here for reproductibility + set_seed(args) + + for _ in train_iterator: + epoch_iterator = tqdm( + train_dataloader, desc="Iteration", disable=True + ) # args.local_rank not in [-1, 0]) + # epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0]) + end_time = timeit.default_timer() + for step, batch in enumerate(epoch_iterator): + record_shapes = True + with torch.autograd.profiler.profile( + enabled=args.profile, + use_cuda=(args.n_gpu > 0), + record_shapes=record_shapes, + ) as prof: + # Skip past any already trained steps if resuming training + if steps_trained_in_current_epoch > 0: + steps_trained_in_current_epoch -= 1 + continue + + if 
prof and args.use_pcl: + pcl_bert.reset_debug_timers() + start_fwd_time = timeit.default_timer() + model.train() + batch = tuple(t.to(args.device) for t in batch) + + inputs = { + "input_ids": batch[0], + "attention_mask": batch[1], + "token_type_ids": batch[2], + "start_positions": batch[3], + "end_positions": batch[4], + } + + if args.model_type in [ + "xlm", + "roberta", + "distilbert", + "camembert", + "bart", + "longformer", + ]: + del inputs["token_type_ids"] + + if args.model_type in ["xlnet", "xlm"]: + inputs.update({"cls_index": batch[5], "p_mask": batch[6]}) + if args.version_2_with_negative: + inputs.update({"is_impossible": batch[7]}) + if hasattr(model, "config") and hasattr(model.config, "lang2id"): + inputs.update( + { + "langs": ( + torch.ones(batch[0].shape, dtype=torch.int64) + * args.lang_id + ).to(args.device) + } + ) + + outputs = model(**inputs) + # model outputs are always tuple in transformers (see doc) + loss = outputs[0] + + if args.n_gpu > 1: + loss = ( + loss.mean() + ) # mean() to average on multi-gpu parallel (not distributed) training + if args.gradient_accumulation_steps > 1: + loss = loss / args.gradient_accumulation_steps + + start_bwd_time = timeit.default_timer() + if args.dtype == "FP16": + with amp.scale_loss(loss, optimizer) as scaled_loss: + scaled_loss.backward() + else: + loss.backward() + + # tr_loss += loss.item() + step_loss = loss.item() + tr_loss += step_loss + start_opt_time = timeit.default_timer() + if (step + 1) % args.gradient_accumulation_steps == 0: + with torch.autograd.profiler.record_function("clip_grad_norm"): + if args.dtype == "FP16": + torch.nn.utils.clip_grad_norm_( + amp.master_params(optimizer), args.max_grad_norm + ) + elif args.use_pcl and args.pcl_bf16: + pcl_bert.clip_grad_norm_( + model.parameters(), args.max_grad_norm + ) + else: + torch.nn.utils.clip_grad_norm_( + model.parameters(), args.max_grad_norm + ) + + with torch.autograd.profiler.record_function("optimizer"): + optimizer.step() + scheduler.step() # Update learning rate schedule + # model.zero_grad() + for p in model.parameters(): + p.grad = None + global_step += 1 + + # Log metrics + if ( + args.local_rank in [-1, 0] + and args.logging_steps > 0 + and global_step % args.logging_steps == 0 + ): + # Only evaluate when single GPU otherwise metrics may not average well + if args.local_rank == -1 and args.evaluate_during_training: + results = evaluate(args, model, tokenizer) + for key, value in results.items(): + tb_writer.add_scalar( + "eval_{}".format(key), value, global_step + ) + tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step) + tb_writer.add_scalar( + "loss", + (tr_loss - logging_loss) / args.logging_steps, + global_step, + ) + logging_loss = tr_loss + + # Save model checkpoint + if ( + args.local_rank in [-1, 0] + and args.save_steps > 0 + and global_step % args.save_steps == 0 + ): + output_dir = os.path.join( + args.output_dir, "checkpoint-{}".format(global_step) + ) + # Take care of distributed/parallel training + model_to_save = ( + model.module if hasattr(model, "module") else model + ) + model_to_save.save_pretrained(output_dir) + tokenizer.save_pretrained(output_dir) + + torch.save(args, os.path.join(output_dir, "training_args.bin")) + logger.info("Saving model checkpoint to %s", output_dir) + + torch.save( + optimizer.state_dict(), + os.path.join(output_dir, "optimizer.pt"), + ) + torch.save( + scheduler.state_dict(), + os.path.join(output_dir, "scheduler.pt"), + ) + logger.info( + "Saving optimizer and scheduler states to %s", 
output_dir + ) + + data_time = start_fwd_time - end_time + end_time = timeit.default_timer() + if args.local_rank in [-1, 0]: + print( + f"Step: {global_step-1}, loss: {step_loss:6g} tr_loss: {tr_loss/(global_step-1):6g} DT: {data_time*1e3:6g} FT: {(start_bwd_time-start_fwd_time)*1e3:6g} BT: {(start_opt_time-start_bwd_time)*1e3:6g} OT: {(end_time-start_opt_time)*1e3:6g} TT: {(end_time-start_fwd_time+data_time)*1e3:6g}" + ) + if prof and args.use_pcl: + pcl_bert.print_debug_timers() + if args.max_steps > 0 and global_step > args.max_steps: + epoch_iterator.close() + break + if prof: + file_prefix = "squad_time%s" % ( + "_r%d" % args.local_rank if args.local_rank >= 0 else "" + ) + with open("%s.prof" % file_prefix, "w") as prof_f: + prof_f.write( + prof.key_averages(group_by_input_shape=record_shapes).table( + sort_by="cpu_time_total" + ) + ) + try: + with open("%s.nested.prof" % file_prefix, "w") as prof_f: + # prof_f.write(prof.nested_key_averages().table(sort_by="cpu_time_total")) + prof_f.write( + prof.nested_key_averages().table( + sort_by=None, row_limit=1000 + ) + ) + with open("%s.top_level.prof" % file_prefix, "w") as prof_f: + prof_f.write( + prof.nested_key_averages(only_top_level=True).table( + sort_by="cpu_time_total" + ) + ) + prof.print_op_timings(prof, prefix=file_prefix) + except: + pass + end_time = timeit.default_timer() + if args.max_steps > 0 and global_step > args.max_steps: + train_iterator.close() + break + + if args.local_rank in [-1, 0]: + tb_writer.close() + + return global_step, tr_loss / global_step + + +def evaluate(args, model, tokenizer, prefix=""): + model_cast(model, device=args.device) + model.eval() + + dataset, examples, features = load_and_cache_examples( + args, tokenizer, evaluate=True, output_examples=True + ) + + if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]: + os.makedirs(args.output_dir) + + args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu) + + # Note that DistributedSampler samples randomly + eval_sampler = SequentialSampler(dataset) + eval_dataloader = DataLoader( + dataset, sampler=eval_sampler, batch_size=args.eval_batch_size + ) + + # multi-gpu evaluate + if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel): + model = torch.nn.DataParallel(model) + + # Eval! 
+ logger.info("***** Running evaluation {} *****".format(prefix)) + logger.info(" Num examples = %d", len(dataset)) + logger.info(" Batch size = %d", args.eval_batch_size) + + all_results = [] + + jit_model = None + jit_trace_path = os.path.join(hub, args.model_type + "_" + args.model_name_or_path.split("_")[1] + "_trace_" + args.dtype + ".zip") + + do_profiling = os.environ.get("PROFILE", "OFF").upper() in ["1", "Y", "ON", "YES", "TRUE"] + + start_time = time.time() + iter_num = 0 + time_collect = [] + for batch in tqdm(eval_dataloader, desc="Evaluating"): + inputs = { + "input_ids": batch[0], + "attention_mask": batch[1], + "token_type_ids": batch[2], + } + + if args.model_type in [ + "xlm", + "roberta", + "distilbert", + "camembert", + "bart", + "longformer", + ]: + del inputs["token_type_ids"] + + feature_indices = batch[3] + + # XLNet and XLM use more arguments for their predictions + if args.model_type in ["xlnet", "xlm"]: + inputs.update({"cls_index": batch[4], "p_mask": batch[5]}) + # for lang_id-sensitive xlm models + if hasattr(model, "config") and hasattr(model.config, "lang2id"): + inputs.update( + { + "langs": ( + torch.ones(batch[0].shape, dtype=torch.int64) + * args.lang_id + ).to(args.device) + } + ) + + if iter_num == 0: + model = torch.xpu.optimize(model=model, dtype=MAP_TORCH_DTYPE[args.dtype], level="O1", weights_prepack = False) + if args.do_jit: + jit_model = load_jit_model(model, inputs, MAP_TORCH_DTYPE[args.dtype], args.device, jit_trace_path) + with torch.no_grad(): + outputs = None + batch_start = None + batch_end = None + with torch.autograd.profiler_legacy.profile(do_profiling, use_xpu=True, record_shapes=False) as prof: + batch_start = time.time() + inputs = collate_fn_(inputs, device = args.device) + if args.do_jit: + outputs = jit_model(**inputs) + else: + outputs = model(**inputs) + # torch.xpu.synchronize() does not take effect + # torch.xpu.synchronize() + for k, v in outputs.items(): + v = v.to(torch.float32).to("cpu") + batch_end = time.time() + print("local latency is:", (batch_end - batch_start), ' s') + print("local throughput is:", args.eval_batch_size/(batch_end - batch_start), ' sentences/s') + if iter_num >= 10 and iter_num <= (len(dataset)/args.eval_batch_size - 10): + time_collect.append(batch_end - batch_start) + + if args.do_jit: + outputs = QuestionAnsweringModelOutput(start_logits = outputs["start_logits"], + end_logits = outputs["end_logits"]) + if iter_num == 10: + if do_profiling: + profiling_path = os.getenv('PROFILE_PATH') + # if no set PROFILE_PATH, use current dir + if profiling_path == None: + profiling_path = './' + if (args.profiling_sub_file != ""): + profiling_path += args.profiling_sub_file + mkdir(profiling_path) + torch.save(prof.key_averages().table(sort_by="self_xpu_time_total"), \ + profiling_path + '/bert_inference_profile.pt') + print(prof.key_averages().table(sort_by="self_xpu_time_total")) + torch.save(prof.table(sort_by="id", row_limit=100000), \ + profiling_path + '/bert_inference_profile_detailed.pt') + prof.export_chrome_trace(profiling_path + '/bert_inference_profile.json') + + for i, feature_index in enumerate(feature_indices): + eval_feature = features[feature_index.item()] + unique_id = int(eval_feature.unique_id) + + output = [to_list(output[i]) for output in outputs.to_tuple()] + + # Some models (XLNet, XLM) use 5 arguments for their predictions, while the other "simpler" + # models only use two. 
+ if len(output) >= 5: + start_logits = output[0] + start_top_index = output[1] + end_logits = output[2] + end_top_index = output[3] + cls_logits = output[4] + + result = SquadResult( + unique_id, + start_logits, + end_logits, + start_top_index=start_top_index, + end_top_index=end_top_index, + cls_logits=cls_logits, + ) + + else: + start_logits, end_logits = output + result = SquadResult(unique_id, start_logits, end_logits) + + all_results.append(result) + iter_num += 1 + if args.num_iterations != -1 and iter_num >= args.num_iterations: + break; + + evalTime = time.time() - start_time + if len(time_collect) > 0: + avg_time = sum(time_collect)/len(time_collect) + print("bert_inf latency: ", avg_time, ' s') + print("bert_inf throughput: ", args.eval_batch_size/avg_time, ' sentences/s') + if args.num_iterations != -1 and args.num_iterations < (len(dataset)/args.eval_batch_size + 1): + exit() + logger.info( + " Evaluation done in total %f secs (%f sec per example)", + evalTime, + evalTime / len(dataset), + ) + + # Compute predictions + output_prediction_file = os.path.join( + args.output_dir, "predictions_{}.json".format(prefix) + ) + output_nbest_file = os.path.join( + args.output_dir, "nbest_predictions_{}.json".format(prefix) + ) + + if args.version_2_with_negative: + output_null_log_odds_file = os.path.join( + args.output_dir, "null_odds_{}.json".format(prefix) + ) + else: + output_null_log_odds_file = None + + # XLNet and XLM use a more complex post-processing procedure + if args.model_type in ["xlnet", "xlm"]: + start_n_top = ( + model.config.start_n_top + if hasattr(model, "config") + else model.module.config.start_n_top + ) + end_n_top = ( + model.config.end_n_top + if hasattr(model, "config") + else model.module.config.end_n_top + ) + + predictions = compute_predictions_log_probs( + examples, + features, + all_results, + args.n_best_size, + args.max_answer_length, + output_prediction_file, + output_nbest_file, + output_null_log_odds_file, + start_n_top, + end_n_top, + args.version_2_with_negative, + tokenizer, + args.verbose_logging, + ) + else: + predictions = compute_predictions_logits( + examples, + features, + all_results, + args.n_best_size, + args.max_answer_length, + args.do_lower_case, + output_prediction_file, + output_nbest_file, + output_null_log_odds_file, + args.verbose_logging, + args.version_2_with_negative, + args.null_score_diff_threshold, + tokenizer, + ) + + # Compute the F1 and exact scores. + results = squad_evaluate(examples, predictions) + return results + + +def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False): + if args.local_rank not in [-1, 0] and not evaluate: + # Make sure only the first process in distributed training process the dataset, and the others will use the cache + torch.distributed.barrier() + + # Load data features from cache or dataset file + input_dir = args.data_dir if args.data_dir else "." 
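+    # The cache file name encodes the split, the model name, and the maximum sequence
+    # length, e.g. "cached_train_<model-name>_384" (illustrative; the actual name depends
+    # on --model_name_or_path and --max_seq_length).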
+ cached_features_file = os.path.join( + input_dir, + "cached_{}_{}_{}".format( + "dev" if evaluate else "train", + list(filter(None, args.model_name_or_path.split("/"))).pop(), + str(args.max_seq_length), + ), + ) + + # Init features and dataset from cache if it exists + if os.path.exists(cached_features_file) and not args.overwrite_cache: + logger.info("Loading features from cached file %s", cached_features_file) + features_and_dataset = torch.load(cached_features_file) + features, dataset, examples = ( + features_and_dataset["features"], + features_and_dataset["dataset"], + features_and_dataset["examples"], + ) + else: + logger.info("Creating features from dataset file at %s", input_dir) + + if not args.data_dir and ( + (evaluate and not args.predict_file) + or (not evaluate and not args.train_file) + ): + try: + import tensorflow_datasets as tfds + except ImportError: + raise ImportError( + "If not data_dir is specified, tensorflow_datasets needs to be installed." + ) + + if args.version_2_with_negative: + logger.warn("tensorflow_datasets does not handle version 2 of SQuAD.") + + tfds_examples = tfds.load("squad") + examples = SquadV1Processor().get_examples_from_dataset( + tfds_examples, evaluate=evaluate + ) + else: + processor = ( + SquadV2Processor() + if args.version_2_with_negative + else SquadV1Processor() + ) + if evaluate: + examples = processor.get_dev_examples( + args.data_dir, filename=args.predict_file + ) + else: + examples = processor.get_train_examples( + args.data_dir, filename=args.train_file + ) + + features, dataset = squad_convert_examples_to_features( + examples=examples, + tokenizer=tokenizer, + max_seq_length=args.max_seq_length, + doc_stride=args.doc_stride, + max_query_length=args.max_query_length, + is_training=not evaluate, + return_dataset="pt", + threads=args.threads, + ) + + if args.local_rank in [-1, 0]: + logger.info("Saving features into cached file %s", cached_features_file) + torch.save( + {"features": features, "dataset": dataset, "examples": examples}, + cached_features_file, + ) + + if args.local_rank == 0 and not evaluate: + # Make sure only the first process in distributed training process the dataset, and the others will use the cache + torch.distributed.barrier() + + if output_examples: + return dataset, examples, features + return dataset + + +def main(): + parser = argparse.ArgumentParser() + + # Required parameters + parser.add_argument( + "--model_type", + default=None, + type=str, + required=True, + help="Model type selected in the list: " + ", ".join(MODEL_TYPES), + ) + parser.add_argument( + "--model_name_or_path", + default=None, + type=str, + required=True, + help="Path to pretrained model or model identifier from huggingface.co/models", + ) + parser.add_argument( + "--output_dir", + default=None, + type=str, + required=True, + help="The output directory where the model checkpoints and predictions will be written.", + ) + + # Other parameters + parser.add_argument( + "--data_dir", + default=None, + type=str, + help="The input data dir. Should contain the .json files for the task." + + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.", + ) + parser.add_argument( + "--train_file", + default=None, + type=str, + help="The input training file. 
If a data dir is specified, will look for the file there" + + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.", + ) + parser.add_argument( + "--predict_file", + default=None, + type=str, + help="The input evaluation file. If a data dir is specified, will look for the file there" + + "If no data dir or train/predict files are specified, will run with tensorflow_datasets.", + ) + parser.add_argument( + "--config_name", + default="", + type=str, + help="Pretrained config name or path if not the same as model_name", + ) + parser.add_argument( + "--tokenizer_name", + default="", + type=str, + help="Pretrained tokenizer name or path if not the same as model_name", + ) + parser.add_argument( + "--cache_dir", + default="", + type=str, + help="Where do you want to store the pre-trained models downloaded from huggingface.co", + ) + + parser.add_argument( + "--dtype", + default="FP32", + type=str, + choices=["FP32", "BF16", "FP16"], + help= "Specify precision to use", + ) + + parser.add_argument( + "--version_2_with_negative", + action="store_true", + help="If true, the SQuAD examples contain some that do not have an answer.", + ) + parser.add_argument( + "--null_score_diff_threshold", + type=float, + default=0.0, + help="If null_score - best_non_null is greater than the threshold predict null.", + ) + + parser.add_argument( + "--max_seq_length", + default=384, + type=int, + help="The maximum total input sequence length after WordPiece tokenization. Sequences " + "longer than this will be truncated, and sequences shorter than this will be padded.", + ) + parser.add_argument( + "--doc_stride", + default=128, + type=int, + help="When splitting up a long document into chunks, how much stride to take between chunks.", + ) + parser.add_argument( + "--max_query_length", + default=64, + type=int, + help="The maximum number of tokens for the question. Questions longer than this will " + "be truncated to this length.", + ) + parser.add_argument( + "--do_train", action="store_true", help="Whether to run training." + ) + parser.add_argument( + "--do_eval", action="store_true", help="Whether to run eval on the dev set." + ) + parser.add_argument( + "--do_jit", action="store_true", help="Whether to run eval with jit on the dev set." + ) + parser.add_argument( + "--evaluate_during_training", + action="store_true", + help="Run evaluation during training at each logging step.", + ) + parser.add_argument( + "--do_lower_case", + action="store_true", + help="Set this flag if you are using an uncased model.", + ) + parser.add_argument( + "--per_gpu_train_batch_size", + default=8, + type=int, + help="Batch size per GPU/CPU for training.", + ) + parser.add_argument( + "--per_gpu_eval_batch_size", + default=8, + type=int, + help="Batch size per GPU/CPU for evaluation.", + ) + parser.add_argument( + "--learning_rate", + default=5e-5, + type=float, + help="The initial learning rate for Adam.", + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument( + "--num_iterations", + type=int, + default=-1, + help="Number of iterations to run evaluation.", + ) + parser.add_argument( + "--weight_decay", default=0.0, type=float, help="Weight decay if we apply some." + ) + parser.add_argument( + "--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer." 
+ ) + parser.add_argument( + "--max_grad_norm", default=1.0, type=float, help="Max gradient norm." + ) + parser.add_argument( + "--num_train_epochs", + default=3.0, + type=float, + help="Total number of training epochs to perform.", + ) + parser.add_argument( + "--max_steps", + default=-1, + type=int, + help="If > 0: set total number of training steps to perform. Override num_train_epochs.", + ) + parser.add_argument( + "--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps." + ) + parser.add_argument( + "--n_best_size", + default=20, + type=int, + help="The total number of n-best predictions to generate in the nbest_predictions.json output file.", + ) + parser.add_argument( + "--max_answer_length", + default=30, + type=int, + help="The maximum length of an answer that can be generated. This is needed because the start " + "and end predictions are not conditioned on one another.", + ) + parser.add_argument( + "--verbose_logging", + action="store_true", + help="If true, all of the warnings related to data processing will be printed. " + "A number of warnings are expected for a normal SQuAD evaluation.", + ) + parser.add_argument( + "--lang_id", + default=0, + type=int, + help="language id of input for language-specific xlm models (see tokenization_xlm.PRETRAINED_INIT_CONFIGURATION)", + ) + + parser.add_argument( + "--logging_steps", type=int, default=500, help="Log every X updates steps." + ) + parser.add_argument( + "--save_steps", + type=int, + default=500, + help="Save checkpoint every X updates steps.", + ) + parser.add_argument( + "--eval_all_checkpoints", + action="store_true", + help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number", + ) + parser.add_argument( + "--device_choice", + default="cpu", + type=str, + choices=["cpu", "xpu", "cuda"], + help= "Specify device to use when available", + ) + parser.add_argument( + "--profile", + action="store_true", + help="Whether to use PCL Fused impl when available", + ) + parser.add_argument( + "--use_pcl", + action="store_true", + help="Whether to use PCL Fused impl when available", + ) + parser.add_argument( + "--pcl_bf16", + action="store_true", + help="Whether to use PCL Fused impl when available", + ) + parser.add_argument( + "--unpad", + action="store_true", + help="Whether to use PCL Fused impl when available", + ) + parser.add_argument( + "--dist_backend", + type=str, + default="ccl", + help="Specify distributed backend to use.", + ) + parser.add_argument( + "--profiling_sub_file", + type=str, + default="", + help="Path to saved torch profiler", + ) + parser.add_argument( + "--overwrite_output_dir", + action="store_true", + help="Overwrite the content of the output directory", + ) + parser.add_argument( + "--overwrite_cache", + action="store_true", + help="Overwrite the cached training and evaluation sets", + ) + parser.add_argument( + "--seed", type=int, default=42, help="random seed for initialization" + ) + + parser.add_argument( + "--local_rank", + type=int, + default=-1, + help="local_rank for distributed training on gpus", + ) + parser.add_argument( + "--fp16_opt_level", + type=str, + default="O1", + help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']." + "See details at https://nvidia.github.io/apex/amp.html", + ) + parser.add_argument( + "--server_ip", type=str, default="", help="Can be used for distant debugging." 
+ ) + parser.add_argument( + "--server_port", type=str, default="", help="Can be used for distant debugging." + ) + + parser.add_argument( + "--threads", + type=int, + default=1, + help="multiple threads for converting example to features", + ) + args = parser.parse_args() + + if args.doc_stride >= args.max_seq_length - args.max_query_length: + logger.warning( + "WARNING - You've set a doc stride which may be superior to the document length in some " + "examples. This could result in errors when building features from the examples. Please reduce the doc " + "stride or increase the maximum length to ensure the features are correctly built." + ) + + if ( + os.path.exists(args.output_dir) + and os.listdir(args.output_dir) + and args.do_train + and not args.overwrite_output_dir + ): + raise ValueError( + "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format( + args.output_dir + ) + ) + + if args.use_pcl: + try: + from pcl_pytorch_extension import bert as pcl_bert + except: + print("CCL backend requested but import torch_ccl failed") + raise + # Setup distant debugging if needed + if args.server_ip and args.server_port: + # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script + import ptvsd + + print("Waiting for debugger attach") + ptvsd.enable_attach( + address=(args.server_ip, args.server_port), redirect_output=True + ) + ptvsd.wait_for_attach() + + # Setup CUDA, GPU & distributed training + if int(os.environ.get("PMI_SIZE", "0")) > 1: + if args.dist_backend == "ccl": + try: + import torch_ccl + except: + print("CCL backend requested but import torch_ccl failed") + raise + elif args.dist_backend == "mpi": + if not torch.distributed.is_mpi_available(): + try: + import torch_mpi + except: + print( + "MPI backend requested but not available try installing torch_mpi module" + ) + raise + else: + raise ValueError(f"{args.dist_backend} backend requested but not supported") + + os.environ["RANK"] = os.environ.get("PMI_RANK", "0") + os.environ["WORLD_SIZE"] = os.environ.get("PMI_SIZE", "1") + torch.distributed.init_process_group(backend=args.dist_backend) + device = torch.device("cpu") + args.n_gpu = 0 + args.local_rank = torch.distributed.get_rank() + print( + f"Using {args.dist_backend.upper()} dist run with {torch.distributed.get_world_size()} ranks" + ) + elif args.local_rank == -1: + device, args.n_gpu = get_device(args.device_choice) + else: # Initializes the distributed backend which will take care of sychronizing nodes/GPUs + torch.cuda.set_device(args.local_rank) + device = torch.device("cuda", args.local_rank) + torch.distributed.init_process_group(backend="nccl") + args.n_gpu = 1 + args.device = device + + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN, + ) + logger.warning( + "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, dtype: %s", + args.local_rank, + device, + args.n_gpu, + bool(args.local_rank != -1), + args.dtype, + ) + # Set the verbosity to info of the Transformers logger (on main process only): + if is_main_process(args.local_rank): + transformers.utils.logging.set_verbosity_info() + transformers.utils.logging.enable_default_handler() + transformers.utils.logging.enable_explicit_format() + # Set seed + set_seed(args) + + # Load pretrained model and tokenizer + if args.local_rank not in [-1, 0]: + # Make 
sure only the first process in distributed training will download model & vocab + torch.distributed.barrier() + + args.model_type = args.model_type.lower() + config = AutoConfig.from_pretrained( + args.config_name if args.config_name else args.model_name_or_path, + cache_dir=args.cache_dir if args.cache_dir else None, + ) + tokenizer = AutoTokenizer.from_pretrained( + args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, + do_lower_case=args.do_lower_case, + cache_dir=args.cache_dir if args.cache_dir else None, + use_fast=False, # SquadDataset is not compatible with Fast tokenizers which have a smarter overflow handeling + ) + model = load_model(args, config = config) + + for m in model.modules(): + if hasattr(m, "maybe_block_params"): + m.maybe_block_params() + # if args.pcl_bf16: m.to(torch.bfloat16) + + if args.local_rank == 0: + # Make sure only the first process in distributed training will download model & vocab + torch.distributed.barrier() + + model.to(args.device) + + logger.info("Training/evaluation parameters %s", args) + + # Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum if args.dtype is float16. + # Otherwise it'll default to "promote" mode, and we'll get fp32 operations. Note that running `--fp16_opt_level="O2"` will + # remove the need for this code, but it is still valid. + if (args.dtype == "FP16") and (args.device_choice == "cuda"): + try: + import apex + + apex.amp.register_half_function(torch, "einsum") + except ImportError: + raise ImportError( + "Please install apex from https://www.github.com/nvidia/apex to use fp16 training." + ) + + # Training + if args.do_train: + train_dataset = load_and_cache_examples( + args, tokenizer, evaluate=False, output_examples=False + ) + global_step, tr_loss = train(args, train_dataset, model, tokenizer) + logger.info(" global_step = %s, average loss = %s", global_step, tr_loss) + + # Save the trained model and the tokenizer + if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0): + logger.info("Saving model checkpoint to %s", args.output_dir) + # Save a trained model, configuration and tokenizer using `save_pretrained()`. 
+ # They can then be reloaded using `from_pretrained()` + # Take care of distributed/parallel training + model_to_save = model.module if hasattr(model, "module") else model + model_to_save.save_pretrained(args.output_dir) + tokenizer.save_pretrained(args.output_dir) + + # Good practice: save your training arguments together with the trained model + torch.save(args, os.path.join(args.output_dir, "training_args.bin")) + + # Load a trained model and vocabulary that you have fine-tuned + model = AutoModelForQuestionAnswering.from_pretrained( + args.output_dir + ) # , force_download=True) + + # SquadDataset is not compatible with Fast tokenizers which have a smarter overflow handeling + # So we use use_fast=False here for now until Fast-tokenizer-compatible-examples are out + tokenizer = AutoTokenizer.from_pretrained( + args.output_dir, do_lower_case=args.do_lower_case, use_fast=False + ) + model.to(args.device) + + # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory + results = {} + if args.do_eval and args.local_rank in [-1, 0]: + if args.do_train: + logger.info("Loading checkpoints saved during training for evaluation") + checkpoints = [args.output_dir] + if args.eval_all_checkpoints: + checkpoints = list( + os.path.dirname(c) + for c in sorted( + glob.glob( + args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True + ) + ) + ) + + else: + logger.info("Loading checkpoint %s for evaluation", args.model_name_or_path) + checkpoints = [args.model_name_or_path] + + logger.info("Evaluate the following checkpoints: %s", checkpoints) + + for checkpoint in checkpoints: + # Reload the model + global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else "" + model = load_model(args, checkpoint = checkpoint) + + # Evaluate + result = evaluate(args, model, tokenizer, prefix=global_step) + + result = dict( + (k + ("_{}".format(global_step) if global_step else ""), v) + for k, v in result.items() + ) + results.update(result) + + logger.info("Results: {}".format(results)) + + return results + + +if __name__ == "__main__": + main() diff --git a/models/language_modeling/pytorch/bert_large/training/gpu/.gitignore b/models/language_modeling/pytorch/bert_large/training/gpu/.gitignore new file mode 100644 index 000000000..d1b4845dc --- /dev/null +++ b/models/language_modeling/pytorch/bert_large/training/gpu/.gitignore @@ -0,0 +1,2 @@ +bert/ +*.tar.gz diff --git a/models/language_modeling/pytorch/bert_large/training/gpu/bert_config.json b/models/language_modeling/pytorch/bert_large/training/gpu/bert_config.json new file mode 100644 index 000000000..14c938cd2 --- /dev/null +++ b/models/language_modeling/pytorch/bert_large/training/gpu/bert_config.json @@ -0,0 +1,14 @@ +{ + "attention_probs_dropout_prob": 0.1, + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 1024, + "initializer_range": 0.02, + "intermediate_size": 4096, + "max_position_embeddings": 512, + "num_attention_heads": 16, + "num_hidden_layers": 24, + "type_vocab_size": 2, + "vocab_size": 30522, + "model_type": "bert" +} diff --git a/models/language_modeling/pytorch/bert_large/training/gpu/data/chop_hdf5_files.py b/models/language_modeling/pytorch/bert_large/training/gpu/data/chop_hdf5_files.py new file mode 100644 index 000000000..439644689 --- /dev/null +++ b/models/language_modeling/pytorch/bert_large/training/gpu/data/chop_hdf5_files.py @@ -0,0 +1,131 @@ +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import glob
+import h5py
+import multiprocessing
+import numpy as np
+from argparse import ArgumentParser, REMAINDER
+from argparse import RawTextHelpFormatter
+import os
+
+hdf5_compression_method = None
+max_pred_per_seq = 76
+seq_length = 512 if "SEQ_LEN" not in os.environ else int(os.environ["SEQ_LEN"])
+n_output_shards = 2048 if "SHARD_NUM" not in os.environ else int(os.environ["SHARD_NUM"])
+input_path = 'hdf5_seq_{}'.format(seq_length)
+input_files = sorted(glob.glob(input_path + '/part*', recursive=False))
+print('n_input_shards =', len(input_files))
+
+print("######### seq_length: {} #########".format(seq_length))
+print("######### n_output_shards: {} #########".format(n_output_shards))
+
+if not os.path.exists('2048_shards_uncompressed_{}'.format(seq_length)):
+    os.mkdir('2048_shards_uncompressed_{}'.format(seq_length))
+
+ofile_prefix = '2048_shards_uncompressed_{}/part_'.format(seq_length)
+ofile_suffix = '_of_' + str(n_output_shards) + '.hdf5'
+
+print('n_output_shards =', n_output_shards)
+
+# First pass over data to get sample count (read only the smallest array to get count)
+n_samples = 0
+for idx, ifile in enumerate(input_files):
+    print("Scanning:", ifile, " -- Progress:", idx+1, '/', len(input_files))
+    h5_ifile = h5py.File(ifile, 'r')
+
+    f_next_sentence_labels = h5_ifile['next_sentence_labels'][:]
+
+    h5_ifile.close()
+    n_samples += f_next_sentence_labels.shape[0]
+
+
+# Find a "nominal" number of samples per shard (calculated to always go over by one shard size)
+# Find excess samples in last shard and distribute removal of excess over first "N" shards (could be done over last, but it doesn't matter and math is easier this way)
+# (since 0 <= excess < nominal_shard_size, the max imbalance will be 1 sample to minimize the straggler effect)
+n_sample_per_ofile_nominal = (n_samples + n_output_shards - 1) // n_output_shards
+n_excess = n_output_shards * n_sample_per_ofile_nominal - n_samples # Always a positive number
+
+print("creating ", n_output_shards, " output file handles. This could take a while.", flush=True)
+ofile_handles = [h5py.File(ofile_prefix + str(x) + ofile_suffix, 'w') for x in range(n_output_shards)]
+
+ofile_idx = 0 # which output file
+ofile_entry_idx = 0 # index into an individual data element of an output file
+ifile_entry_idx = 0
+
+n_samples_in_this_shard = n_sample_per_ofile_nominal - 1
+o_input_ids = np.ndarray((n_samples_in_this_shard, seq_length))
+o_input_masks = np.ndarray((n_samples_in_this_shard, seq_length))
+o_segment_ids = np.ndarray((n_samples_in_this_shard, seq_length))
+o_masked_lm_positions = np.ndarray((n_samples_in_this_shard, max_pred_per_seq))
+o_masked_lm_ids = np.ndarray((n_samples_in_this_shard, max_pred_per_seq))
+o_next_sentence_labels = np.ndarray((n_samples_in_this_shard))
+
+for idx, ifile in enumerate(input_files):
+    print("Processing:", ifile, " -- Progress:", idx+1, '/', len(input_files))
+    h5_ifile = h5py.File(ifile, 'r')
+
+    ifile_entry_idx = 0
+    f_input_ids = h5_ifile['input_ids'][:]
+    f_input_masks = h5_ifile['input_mask'][:]
+    f_segment_ids = h5_ifile['segment_ids'][:]
+    f_masked_lm_positions = h5_ifile['masked_lm_positions'][:]
+    f_masked_lm_ids = h5_ifile['masked_lm_ids'][:]
+    f_next_sentence_labels = h5_ifile['next_sentence_labels'][:]
+
+    h5_ifile.close()
+
+    # This could be vectorized but keeping it simple due to lack of time
+    while ifile_entry_idx < f_input_ids.shape[0]:
+        if ofile_entry_idx == n_samples_in_this_shard:
+            ofile_handles[ofile_idx].create_dataset("input_ids", data=o_input_ids, dtype='i4', compression=hdf5_compression_method)
+            ofile_handles[ofile_idx].create_dataset("input_mask", data=o_input_masks, dtype='i1', compression=hdf5_compression_method)
+            ofile_handles[ofile_idx].create_dataset("segment_ids", data=o_segment_ids, dtype='i1', compression=hdf5_compression_method)
+            ofile_handles[ofile_idx].create_dataset("masked_lm_positions", data=o_masked_lm_positions, dtype='i4', compression=hdf5_compression_method)
+            ofile_handles[ofile_idx].create_dataset("masked_lm_ids", data=o_masked_lm_ids, dtype='i4', compression=hdf5_compression_method)
+            ofile_handles[ofile_idx].create_dataset("next_sentence_labels", data=o_next_sentence_labels, dtype='i1', compression=hdf5_compression_method)
+            ofile_handles[ofile_idx].flush()
+            ofile_handles[ofile_idx].close()
+
+            ofile_entry_idx = 0
+            ofile_idx += 1
+            print("Opening output idx:", ofile_idx)
+
+            n_samples_in_this_shard = n_sample_per_ofile_nominal
+            if ofile_idx < n_excess:
+                n_samples_in_this_shard -= 1
+
+            o_input_ids = np.ndarray((n_samples_in_this_shard, seq_length))
+            o_input_masks = np.ndarray((n_samples_in_this_shard, seq_length))
+            o_segment_ids = np.ndarray((n_samples_in_this_shard, seq_length))
+            o_masked_lm_positions = np.ndarray((n_samples_in_this_shard, max_pred_per_seq))
+            o_masked_lm_ids = np.ndarray((n_samples_in_this_shard, max_pred_per_seq))
+            o_next_sentence_labels = np.ndarray((n_samples_in_this_shard))
+
+        o_input_ids[ofile_entry_idx] = f_input_ids[ifile_entry_idx]
+        o_input_masks[ofile_entry_idx] = f_input_masks[ifile_entry_idx]
+        o_segment_ids[ofile_entry_idx] = f_segment_ids[ifile_entry_idx]
+        o_masked_lm_positions[ofile_entry_idx] = f_masked_lm_positions[ifile_entry_idx]
+        o_masked_lm_ids[ofile_entry_idx] = f_masked_lm_ids[ifile_entry_idx]
+        o_next_sentence_labels[ofile_entry_idx] = f_next_sentence_labels[ifile_entry_idx]
+        ofile_entry_idx += 1
+
+        ifile_entry_idx += 1
+
+if __name__ == '__main__':
+    parser = ArgumentParser(description="This is a script to parse the trace file")
+    parser.add_argument("--trace", metavar='\b',
default="test_trace_10.json", type=str, + help="The trace file. ") + args = parser.parse_args() + + diff --git a/models/language_modeling/pytorch/bert_large/training/gpu/data/create_pretraining_data.py b/models/language_modeling/pytorch/bert_large/training/gpu/data/create_pretraining_data.py new file mode 100644 index 000000000..95c872444 --- /dev/null +++ b/models/language_modeling/pytorch/bert_large/training/gpu/data/create_pretraining_data.py @@ -0,0 +1,455 @@ +# coding=utf-8 +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Copyright 2020 MLBenchmark Group. All rights reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Create masked LM/next sentence masked_lm TF examples for BERT.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import random +import tokenization +import tensorflow as tf + +import h5py +import numpy as np + +hdf5_compression_method = None + +#flags = tf.flags +flags = tf.compat.v1.flags + +FLAGS = flags.FLAGS + +flags.DEFINE_string("input_file", None, + "Input raw text file (or comma-separated list of files).") + +flags.DEFINE_string( + "output_file", None, + "Output TF example file (or comma-separated list of files).") + +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the BERT model was trained on.") + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text. 
Should be True for uncased " + "models and False for cased models.") + +flags.DEFINE_integer("max_seq_length", 128, "Maximum sequence length.") + +flags.DEFINE_integer("max_predictions_per_seq", 20, + "Maximum number of masked LM predictions per sequence.") + +flags.DEFINE_integer("random_seed", 12345, "Random seed for data generation.") + +flags.DEFINE_integer( + "dupe_factor", 10, + "Number of times to duplicate the input data (with different masks).") + +flags.DEFINE_float("masked_lm_prob", 0.15, "Masked LM probability.") + +flags.DEFINE_float( + "short_seq_prob", 0.1, + "Probability of creating sequences which are shorter than the " + "maximum length.") + + +class TrainingInstance(object): + """A single training instance (sentence pair).""" + + def __init__(self, tokens, segment_ids, masked_lm_positions, masked_lm_labels, + is_random_next): + self.tokens = tokens + self.segment_ids = segment_ids + self.is_random_next = is_random_next + self.masked_lm_positions = masked_lm_positions + self.masked_lm_labels = masked_lm_labels + + def __str__(self): + s = "" + s += "tokens: %s\n" % (" ".join( + [tokenization.printable_text(x) for x in self.tokens])) + s += "segment_ids: %s\n" % (" ".join([str(x) for x in self.segment_ids])) + s += "is_random_next: %s\n" % self.is_random_next + s += "masked_lm_positions: %s\n" % (" ".join( + [str(x) for x in self.masked_lm_positions])) + s += "masked_lm_labels: %s\n" % (" ".join( + [tokenization.printable_text(x) for x in self.masked_lm_labels])) + s += "\n" + return s + + def __repr__(self): + return self.__str__() + + +def write_instance_to_example_files(instances, tokenizer, max_seq_length, + max_predictions_per_seq, output_files): + """Create TF example files from `TrainingInstance`s.""" + writers = [] + h5_writers = [] + + expected_instances_per_file = len(instances) // len(output_files) + 500 # Over-allocation to avoid resizing + for output_file in output_files: + h5_writers.append({ + 'handle' : h5py.File(output_file + ".hdf5", 'w'), + 'input_ids' : np.zeros([expected_instances_per_file, max_seq_length], dtype="int32"), + 'input_mask' : np.zeros([expected_instances_per_file, max_seq_length], dtype="int32"), + 'segment_ids' : np.zeros([expected_instances_per_file, max_seq_length], dtype="int32"), + 'masked_lm_positions' : np.zeros([expected_instances_per_file, max_predictions_per_seq], dtype="int32"), + 'masked_lm_ids' : np.zeros([expected_instances_per_file, max_predictions_per_seq], dtype="int32"), + 'next_sentence_labels' : np.zeros(expected_instances_per_file, dtype="int32"), + 'len' : 0 }) + + writer_index = 0 + + total_written = 0 + + features_h5 = collections.OrderedDict() + + for (inst_index, instance) in enumerate(instances): + input_ids = tokenizer.convert_tokens_to_ids(instance.tokens) + input_mask = [1] * len(input_ids) + segment_ids = list(instance.segment_ids) + assert len(input_ids) <= max_seq_length + + while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + masked_lm_positions = list(instance.masked_lm_positions) + masked_lm_ids = tokenizer.convert_tokens_to_ids(instance.masked_lm_labels) + masked_lm_weights = [1.0] * len(masked_lm_ids) + + while len(masked_lm_positions) < max_predictions_per_seq: + masked_lm_positions.append(0) + masked_lm_ids.append(0) + masked_lm_weights.append(0.0) + + next_sentence_label = 1 if instance.is_random_next else 0 + + 
h5_writers[writer_index]['input_ids'][inst_index] = input_ids + h5_writers[writer_index]['input_mask'][inst_index] = input_mask + h5_writers[writer_index]['segment_ids'][inst_index] = segment_ids + h5_writers[writer_index]['masked_lm_positions'][inst_index] = masked_lm_positions + h5_writers[writer_index]['masked_lm_ids'][inst_index] = masked_lm_ids + h5_writers[writer_index]['next_sentence_labels'][inst_index] = next_sentence_label + h5_writers[writer_index]['len'] += 1 + + writer_index = (writer_index + 1) % len(h5_writers) + + total_written += 1 + + if inst_index < 20: + tf.compat.v1.logging.info("*** Example ***") + tf.compat.v1.logging.info("tokens: %s" % " ".join( + [tokenization.printable_text(x) for x in instance.tokens])) + + print("saving data") + for h5_writer in h5_writers: + my_size = h5_writer['len'] + h5_writer['handle'].create_dataset('input_ids', data=h5_writer['input_ids'][:my_size], dtype='i4', compression=hdf5_compression_method) + h5_writer['handle'].create_dataset('input_mask', data=h5_writer['input_mask'][:my_size], dtype='i1', compression=hdf5_compression_method) + h5_writer['handle'].create_dataset('segment_ids', data=h5_writer['segment_ids'][:my_size], dtype='i1', compression=hdf5_compression_method) + h5_writer['handle'].create_dataset('masked_lm_positions', data=h5_writer['masked_lm_positions'][:my_size], dtype='i4', compression=hdf5_compression_method) + h5_writer['handle'].create_dataset('masked_lm_ids', data=h5_writer['masked_lm_ids'][:my_size], dtype='i4', compression=hdf5_compression_method) + h5_writer['handle'].create_dataset('next_sentence_labels', data=h5_writer['next_sentence_labels'][:my_size], dtype='i1', compression=hdf5_compression_method) + h5_writer['handle'].flush() + h5_writer['handle'].close() + + tf.compat.v1.logging.info("Wrote %d total instances", total_written) + + +def create_int_feature(values): + feature = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) + return feature + +def create_float_feature(values): + feature = tf.train.Feature(float_list=tf.train.FloatList(value=list(values))) + return feature + +def create_training_instances(input_files, tokenizer, max_seq_length, + dupe_factor, short_seq_prob, masked_lm_prob, + max_predictions_per_seq, rng): + """Create `TrainingInstance`s from raw text.""" + all_documents = [[]] + + # Input file format: + # (1) One sentence per line. These should ideally be actual sentences, not + # entire paragraphs or arbitrary spans of text. (Because we use the + # sentence boundaries for the "next sentence prediction" task). + # (2) Blank lines between documents. Document boundaries are needed so + # that the "next sentence prediction" task doesn't span between documents. 
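+  # For illustration only, a minimal input file could look like:
+  #
+  #   This is the first sentence of document one.
+  #   This is its second sentence.
+  #
+  #   Document two starts after the blank line.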
+ for input_file in input_files: + with tf.compat.v1.gfile.GFile(input_file, "r") as reader: + while True: + line = tokenization.convert_to_unicode(reader.readline()) + if not line: + break + line = line.strip() + + # Empty lines are used as document delimiters + if not line: + all_documents.append([]) + tokens = tokenizer.tokenize(line) + if tokens: + all_documents[-1].append(tokens) + + # Remove empty documents + all_documents = [x for x in all_documents if x] + rng.shuffle(all_documents) + + vocab_words = list(tokenizer.vocab.keys()) + instances = [] + for _ in range(dupe_factor): + for document_index in range(len(all_documents)): + instances.extend( + create_instances_from_document( + all_documents, document_index, max_seq_length, short_seq_prob, + masked_lm_prob, max_predictions_per_seq, vocab_words, rng)) + + rng.shuffle(instances) + return instances + + +def create_instances_from_document( + all_documents, document_index, max_seq_length, short_seq_prob, + masked_lm_prob, max_predictions_per_seq, vocab_words, rng): + """Creates `TrainingInstance`s for a single document.""" + document = all_documents[document_index] + + # Account for [CLS], [SEP], [SEP] + max_num_tokens = max_seq_length - 3 + + # We *usually* want to fill up the entire sequence since we are padding + # to `max_seq_length` anyways, so short sequences are generally wasted + # computation. However, we *sometimes* + # (i.e., short_seq_prob == 0.1 == 10% of the time) want to use shorter + # sequences to minimize the mismatch between pre-training and fine-tuning. + # The `target_seq_length` is just a rough target however, whereas + # `max_seq_length` is a hard limit. + target_seq_length = max_num_tokens + if rng.random() < short_seq_prob: + target_seq_length = rng.randint(2, max_num_tokens) + + # We DON'T just concatenate all of the tokens from a document into a long + # sequence and choose an arbitrary split point because this would make the + # next sentence prediction task too easy. Instead, we split the input into + # segments "A" and "B" based on the actual "sentences" provided by the user + # input. + instances = [] + current_chunk = [] + current_length = 0 + i = 0 + while i < len(document): + segment = document[i] + current_chunk.append(segment) + current_length += len(segment) + if i == len(document) - 1 or current_length >= target_seq_length: + if current_chunk: + # `a_end` is how many segments from `current_chunk` go into the `A` + # (first) sentence. + a_end = 1 + if len(current_chunk) >= 2: + a_end = rng.randint(1, len(current_chunk) - 1) + + tokens_a = [] + for j in range(a_end): + tokens_a.extend(current_chunk[j]) + + tokens_b = [] + # Random next + is_random_next = False + if len(current_chunk) == 1 or rng.random() < 0.5: + is_random_next = True + target_b_length = target_seq_length - len(tokens_a) + + # This should rarely go for more than one iteration for large + # corpora. However, just to be careful, we try to make sure that + # the random document is not the same as the document + # we're processing. + for _ in range(10): + random_document_index = rng.randint(0, len(all_documents) - 1) + if random_document_index != document_index: + break + + random_document = all_documents[random_document_index] + random_start = rng.randint(0, len(random_document) - 1) + for j in range(random_start, len(random_document)): + tokens_b.extend(random_document[j]) + if len(tokens_b) >= target_b_length: + break + # We didn't actually use these segments so we "put them back" so + # they don't go to waste. 
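+          # Rewinding `i` below makes the outer loop revisit the unused segments,
+          # so no input text is lost.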
+ num_unused_segments = len(current_chunk) - a_end + i -= num_unused_segments + # Actual next + else: + is_random_next = False + for j in range(a_end, len(current_chunk)): + tokens_b.extend(current_chunk[j]) + truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng) + + assert len(tokens_a) >= 1 + assert len(tokens_b) >= 1 + + tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in tokens_a: + tokens.append(token) + segment_ids.append(0) + + tokens.append("[SEP]") + segment_ids.append(0) + + for token in tokens_b: + tokens.append(token) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + (tokens, masked_lm_positions, + masked_lm_labels) = create_masked_lm_predictions( + tokens, masked_lm_prob, max_predictions_per_seq, vocab_words, rng) + instance = TrainingInstance( + tokens=tokens, + segment_ids=segment_ids, + is_random_next=is_random_next, + masked_lm_positions=masked_lm_positions, + masked_lm_labels=masked_lm_labels) + instances.append(instance) + current_chunk = [] + current_length = 0 + i += 1 + + return instances + +MaskedLmInstance = collections.namedtuple("MaskedLmInstance", + ["index", "label"]) + +def create_masked_lm_predictions(tokens, masked_lm_prob, + max_predictions_per_seq, vocab_words, rng): + """Creates the predictions for the masked LM objective.""" + + cand_indexes = [] + for (i, token) in enumerate(tokens): + if token == "[CLS]" or token == "[SEP]": + continue + cand_indexes.append(i) + + rng.shuffle(cand_indexes) + + output_tokens = list(tokens) + + num_to_predict = min(max_predictions_per_seq, + max(1, int(round(len(tokens) * masked_lm_prob)))) + + masked_lms = [] + covered_indexes = set() + for index in cand_indexes: + if len(masked_lms) >= num_to_predict: + break + if index in covered_indexes: + continue + covered_indexes.add(index) + + masked_token = None + # 80% of the time, replace with [MASK] + if rng.random() < 0.8: + masked_token = "[MASK]" + else: + # 10% of the time, keep original + if rng.random() < 0.5: + masked_token = tokens[index] + # 10% of the time, replace with random word + else: + masked_token = vocab_words[rng.randint(0, len(vocab_words) - 1)] + + output_tokens[index] = masked_token + + masked_lms.append(MaskedLmInstance(index=index, label=tokens[index])) + + masked_lms = sorted(masked_lms, key=lambda x: x.index) + + masked_lm_positions = [] + masked_lm_labels = [] + for p in masked_lms: + masked_lm_positions.append(p.index) + masked_lm_labels.append(p.label) + + return (output_tokens, masked_lm_positions, masked_lm_labels) + + +def truncate_seq_pair(tokens_a, tokens_b, max_num_tokens, rng): + """Truncates a pair of sequences to a maximum sequence length.""" + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_num_tokens: + break + + trunc_tokens = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b + assert len(trunc_tokens) >= 1 + + # We want to sometimes truncate from the front and sometimes from the + # back to add more randomness and avoid biases. 
+ if rng.random() < 0.5: + del trunc_tokens[0] + else: + trunc_tokens.pop() + + +def main(_): + tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.INFO) + + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + + input_files = [] + for input_pattern in FLAGS.input_file.split(","): + input_files.extend(tf.compat.v1.gfile.Glob(input_pattern)) + + tf.compat.v1.logging.info("*** Reading from input files ***") + for input_file in input_files: + tf.compat.v1.logging.info(" %s", input_file) + + rng = random.Random(FLAGS.random_seed) + instances = create_training_instances( + input_files, tokenizer, FLAGS.max_seq_length, FLAGS.dupe_factor, + FLAGS.short_seq_prob, FLAGS.masked_lm_prob, FLAGS.max_predictions_per_seq, + rng) + + output_files = FLAGS.output_file.split(",") + tf.compat.v1.logging.info("*** Writing to output files ***") + for output_file in output_files: + tf.compat.v1.logging.info(" %s", output_file) + + write_instance_to_example_files(instances, tokenizer, FLAGS.max_seq_length, + FLAGS.max_predictions_per_seq, output_files) + + +if __name__ == "__main__": + flags.mark_flag_as_required("input_file") + flags.mark_flag_as_required("output_file") + flags.mark_flag_as_required("vocab_file") + tf.compat.v1.app.run() \ No newline at end of file diff --git a/models/language_modeling/pytorch/bert_large/training/gpu/data/create_pretraining_data_wrapper.sh b/models/language_modeling/pytorch/bert_large/training/gpu/data/create_pretraining_data_wrapper.sh new file mode 100755 index 000000000..8f637bf87 --- /dev/null +++ b/models/language_modeling/pytorch/bert_large/training/gpu/data/create_pretraining_data_wrapper.sh @@ -0,0 +1,46 @@ + +#!/bin/bash +# Copyright (c) 2022-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
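+#
+# Illustrative usage (assumes PROCESSED_DATASET_DIR is exported and vocab.txt is
+# available in this directory; the input path below is only an example):
+#   SEQ_LEN=512 ./create_pretraining_data_wrapper.sh /path/to/dataset/part-00000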
+ +input_path=${1} +SEQ_LEN=${SEQ_LEN:-512} +output_dir=${PROCESSED_DATASET_DIR}/"hdf5_seq_"${SEQ_LEN} +input_file=$(basename $input_path) + +python3 ./create_pretraining_data.py \ + --input_file=${input_path} \ + --output_file="${output_dir}/${input_file}" \ + --vocab_file=vocab.txt \ + --do_lower_case=True \ + --max_seq_length=$SEQ_LEN \ + --max_predictions_per_seq=76 \ + --masked_lm_prob=0.15 \ + --random_seed=12345 \ + --dupe_factor=10 + + diff --git a/models/language_modeling/pytorch/bert_large/training/gpu/data/parallel_create_hdf5.sh b/models/language_modeling/pytorch/bert_large/training/gpu/data/parallel_create_hdf5.sh new file mode 100755 index 000000000..a9b1aef59 --- /dev/null +++ b/models/language_modeling/pytorch/bert_large/training/gpu/data/parallel_create_hdf5.sh @@ -0,0 +1,20 @@ +#!/bin/bash +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +cpus=$( ls -d /sys/devices/system/cpu/cpu[[:digit:]]* | wc -w ) +cpus=$((cpus / 2)) +echo "Using $cpus CPU cores" +SEQ_LEN=${SEQ_LEN:-512} +mkdir -p ${PROCESSED_DATASET_DIR}/"hdf5_seq_"${SEQ_LEN} +find -L ${DATASET_DIR} -name "part*" | xargs --max-args=1 --max-procs=$cpus ./create_pretraining_data_wrapper.sh diff --git a/models/language_modeling/pytorch/bert_large/training/gpu/data/parallel_create_hdf5_minidata.sh b/models/language_modeling/pytorch/bert_large/training/gpu/data/parallel_create_hdf5_minidata.sh new file mode 100755 index 000000000..3b3612a1c --- /dev/null +++ b/models/language_modeling/pytorch/bert_large/training/gpu/data/parallel_create_hdf5_minidata.sh @@ -0,0 +1,20 @@ +#!/bin/bash +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +cpus=$( ls -d /sys/devices/system/cpu/cpu[[:digit:]]* | wc -w ) +cpus=$((cpus / 2)) +echo "Using $cpus CPU cores" +SEQ_LEN=${SEQ_LEN:-512} +mkdir -p "hdf5_seq_"${SEQ_LEN} +find -L mini-data/ -name "part*" | xargs --max-args=1 --max-procs=$cpus ./create_pretraining_data_wrapper.sh diff --git a/models/language_modeling/pytorch/bert_large/training/gpu/data/tokenization.py b/models/language_modeling/pytorch/bert_large/training/gpu/data/tokenization.py new file mode 100644 index 000000000..72065d6bc --- /dev/null +++ b/models/language_modeling/pytorch/bert_large/training/gpu/data/tokenization.py @@ -0,0 +1,433 @@ +# coding=utf-8 +# Copyright 2020 MLBenchmark Group. All rights reserved. 
+ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tokenization classes.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import re +import unicodedata + +from absl import flags +import six +import tensorflow.compat.v1 as tf + +FLAGS = tf.flags.FLAGS + +flags.DEFINE_bool( + "preserve_unused_tokens", False, + "If True, Wordpiece tokenization will not be applied to words in the vocab." +) + +_UNUSED_TOKEN_RE = re.compile("^\\[unused\\d+\\]$") + + +def preserve_token(token, vocab): + """Returns True if the token should forgo tokenization and be preserved.""" + if not FLAGS.preserve_unused_tokens: + return False + if token not in vocab: + return False + return bool(_UNUSED_TOKEN_RE.search(token)) + + +def validate_case_matches_checkpoint(do_lower_case, init_checkpoint): + """Checks whether the casing config is consistent with the checkpoint name.""" + + # The casing has to be passed in by the user and there is no explicit check + # as to whether it matches the checkpoint. The casing information probably + # should have been stored in the bert_config.json file, but it's not, so + # we have to heuristically detect it to validate. + + if not init_checkpoint: + return + + m = re.match("^.*?([A-Za-z0-9_-]+)/bert_model.ckpt", init_checkpoint) + if m is None: + return + + model_name = m.group(1) + + lower_models = [ + "uncased_L-24_H-1024_A-16", "uncased_L-12_H-768_A-12", + "multilingual_L-12_H-768_A-12", "chinese_L-12_H-768_A-12" + ] + + cased_models = [ + "cased_L-12_H-768_A-12", "cased_L-24_H-1024_A-16", + "multi_cased_L-12_H-768_A-12" + ] + + is_bad_config = False + if model_name in lower_models and not do_lower_case: + is_bad_config = True + actual_flag = "False" + case_name = "lowercased" + opposite_flag = "True" + + if model_name in cased_models and do_lower_case: + is_bad_config = True + actual_flag = "True" + case_name = "cased" + opposite_flag = "False" + + if is_bad_config: + raise ValueError( + "You passed in `--do_lower_case=%s` with `--init_checkpoint=%s`. " + "However, `%s` seems to be a %s model, so you " + "should pass in `--do_lower_case=%s` so that the fine-tuning matches " + "how the model was pre-training. If this error is wrong, please " + "just comment out this check." 
% (actual_flag, init_checkpoint, + model_name, case_name, opposite_flag)) + + +def convert_to_unicode(text): + """Converts `text` to Unicode (if it's not already), assuming utf-8 input.""" + if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text.decode("utf-8", "ignore") + elif isinstance(text, unicode): + return text + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python2 or Python 3?") + + +def printable_text(text): + """Returns text encoded in a way suitable for print or `tf.logging`.""" + + # These functions want `str` for both Python2 and Python3, but in one case + # it's a Unicode string and in the other it's a byte string. + if six.PY3: + if isinstance(text, str): + return text + elif isinstance(text, bytes): + return text.decode("utf-8", "ignore") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + elif six.PY2: + if isinstance(text, str): + return text + elif isinstance(text, unicode): + return text.encode("utf-8") + else: + raise ValueError("Unsupported string type: %s" % (type(text))) + else: + raise ValueError("Not running on Python2 or Python 3?") + + +def load_vocab(vocab_file): + """Loads a vocabulary file into a dictionary.""" + vocab = collections.OrderedDict() + with tf.gfile.GFile(vocab_file, "r") as reader: + while True: + token = convert_to_unicode(reader.readline()) + if not token: + break + token = token.strip() + if token not in vocab: + vocab[token] = len(vocab) + return vocab + + +def convert_by_vocab(vocab, items): + """Converts a sequence of [tokens|ids] using the vocab.""" + output = [] + for item in items: + output.append(vocab[item]) + return output + + +def convert_tokens_to_ids(vocab, tokens): + return convert_by_vocab(vocab, tokens) + + +def convert_ids_to_tokens(inv_vocab, ids): + return convert_by_vocab(inv_vocab, ids) + + +def whitespace_tokenize(text): + """Runs basic whitespace cleaning and splitting on a piece of text.""" + text = text.strip() + if not text: + return [] + tokens = text.split() + return tokens + + +class FullTokenizer(object): + """Runs end-to-end tokenziation.""" + + def __init__(self, vocab_file, do_lower_case=True): + self.vocab = load_vocab(vocab_file) + self.inv_vocab = {v: k for k, v in self.vocab.items()} + self.basic_tokenizer = BasicTokenizer( + do_lower_case=do_lower_case, vocab=self.vocab) + self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab) + + def tokenize(self, text): + split_tokens = [] + for token in self.basic_tokenizer.tokenize(text): + if preserve_token(token, self.vocab): + split_tokens.append(token) + continue + for sub_token in self.wordpiece_tokenizer.tokenize(token): + split_tokens.append(sub_token) + + return split_tokens + + def convert_tokens_to_ids(self, tokens): + return convert_by_vocab(self.vocab, tokens) + + def convert_ids_to_tokens(self, ids): + return convert_by_vocab(self.inv_vocab, ids) + + +class BasicTokenizer(object): + """Runs basic tokenization (punctuation splitting, lower casing, etc.).""" + + def __init__(self, do_lower_case=True, vocab=tuple()): + """Constructs a BasicTokenizer. + Args: + do_lower_case: Whether to lower case the input. + vocab: A container of tokens to not mutate during tokenization. 
+ """ + self.do_lower_case = do_lower_case + self.vocab = vocab + + def tokenize(self, text): + """Tokenizes a piece of text.""" + text = convert_to_unicode(text) + text = self._clean_text(text) + + # This was added on November 1st, 2018 for the multilingual and Chinese + # models. This is also applied to the English models now, but it doesn't + # matter since the English models were not trained on any Chinese data + # and generally don't have any Chinese data in them (there are Chinese + # characters in the vocabulary because Wikipedia does have some Chinese + # words in the English Wikipedia.). + text = self._tokenize_chinese_chars(text) + + orig_tokens = whitespace_tokenize(text) + split_tokens = [] + for token in orig_tokens: + if preserve_token(token, self.vocab): + split_tokens.append(token) + continue + if self.do_lower_case: + token = token.lower() + token = self._run_strip_accents(token) + split_tokens.extend(self._run_split_on_punc(token)) + + output_tokens = whitespace_tokenize(" ".join(split_tokens)) + return output_tokens + + def _run_strip_accents(self, text): + """Strips accents from a piece of text.""" + text = unicodedata.normalize("NFD", text) + output = [] + for char in text: + cat = unicodedata.category(char) + if cat == "Mn": + continue + output.append(char) + return "".join(output) + + def _run_split_on_punc(self, text): + """Splits punctuation on a piece of text.""" + chars = list(text) + i = 0 + start_new_word = True + output = [] + while i < len(chars): + char = chars[i] + if _is_punctuation(char): + output.append([char]) + start_new_word = True + else: + if start_new_word: + output.append([]) + start_new_word = False + output[-1].append(char) + i += 1 + + return ["".join(x) for x in output] + + def _tokenize_chinese_chars(self, text): + """Adds whitespace around any CJK character.""" + output = [] + for char in text: + cp = ord(char) + if self._is_chinese_char(cp): + output.append(" ") + output.append(char) + output.append(" ") + else: + output.append(char) + return "".join(output) + + def _is_chinese_char(self, cp): + """Checks whether CP is the codepoint of a CJK character.""" + # This defines a "chinese character" as anything in the CJK Unicode block: + # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block) + # + # Note that the CJK Unicode block is NOT all Japanese and Korean characters, + # despite its name. The modern Korean Hangul alphabet is a different block, + # as is Japanese Hiragana and Katakana. Those alphabets are used to write + # space-separated words, so they are not treated specially and handled + # like the all of the other languages. 
+ if ((cp >= 0x4E00 and cp <= 0x9FFF) or # + (cp >= 0x3400 and cp <= 0x4DBF) or # + (cp >= 0x20000 and cp <= 0x2A6DF) or # + (cp >= 0x2A700 and cp <= 0x2B73F) or # + (cp >= 0x2B740 and cp <= 0x2B81F) or # + (cp >= 0x2B820 and cp <= 0x2CEAF) or + (cp >= 0xF900 and cp <= 0xFAFF) or # + (cp >= 0x2F800 and cp <= 0x2FA1F)): # + return True + + return False + + def _clean_text(self, text): + """Performs invalid character removal and whitespace cleanup on text.""" + output = [] + for char in text: + cp = ord(char) + if cp == 0 or cp == 0xfffd or _is_control(char): + continue + if _is_whitespace(char): + output.append(" ") + else: + output.append(char) + return "".join(output) + + +class WordpieceTokenizer(object): + """Runs WordPiece tokenziation.""" + + def __init__(self, vocab, unk_token="[UNK]", max_input_chars_per_word=200): + self.vocab = vocab + self.unk_token = unk_token + self.max_input_chars_per_word = max_input_chars_per_word + + def tokenize(self, text): + """Tokenizes a piece of text into its word pieces. + This uses a greedy longest-match-first algorithm to perform tokenization + using the given vocabulary. + For example: + input = "unaffable" + output = ["un", "##aff", "##able"] + Args: + text: A single token or whitespace separated tokens. This should have + already been passed through `BasicTokenizer. + Returns: + A list of wordpiece tokens. + """ + + text = convert_to_unicode(text) + + output_tokens = [] + for token in whitespace_tokenize(text): + chars = list(token) + if len(chars) > self.max_input_chars_per_word: + output_tokens.append(self.unk_token) + continue + + is_bad = False + start = 0 + sub_tokens = [] + while start < len(chars): + end = len(chars) + cur_substr = None + while start < end: + substr = "".join(chars[start:end]) + if start > 0: + substr = "##" + substr + if substr in self.vocab: + cur_substr = substr + break + end -= 1 + if cur_substr is None: + is_bad = True + break + sub_tokens.append(cur_substr) + start = end + + if is_bad: + output_tokens.append(self.unk_token) + else: + output_tokens.extend(sub_tokens) + return output_tokens + + +def _is_whitespace(char): + """Checks whether `chars` is a whitespace character.""" + # \t, \n, and \r are technically control characters but we treat them + # as whitespace since they are generally considered as such. + if char == " " or char == "\t" or char == "\n" or char == "\r": + return True + cat = unicodedata.category(char) + if cat == "Zs": + return True + return False + + +def _is_control(char): + """Checks whether `chars` is a control character.""" + # These are technically control characters but we count them as whitespace + # characters. + if char == "\t" or char == "\n" or char == "\r": + return False + cat = unicodedata.category(char) + if cat in ("Cc", "Cf"): + return True + return False + + +def _is_punctuation(char): + """Checks whether `chars` is a punctuation character.""" + cp = ord(char) + # We treat all non-letter/number ASCII as punctuation. + # Characters such as "^", "$", and "`" are not in the Unicode + # Punctuation class but we treat them as punctuation anyways, for + # consistency. + if ((cp >= 33 and cp <= 47) or (cp >= 58 and cp <= 64) or + (cp >= 91 and cp <= 96) or (cp >= 123 and cp <= 126)): + return True + cat = unicodedata.category(char) + if cat.startswith("P"): + return True + return False + + +# Just an example to use tokenizer +if __name__ == '__main__': + tokenizer = FullTokenizer('vocab.txt') + text = 'We treat all non-letter/number ASCII as punctuation.' 
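A toy WordPiece run on a hand-built vocabulary, to make the greedy longest-match-first loop above concrete (illustration only; the end-to-end example here loads the full vocab.txt instead):

    toy_vocab = {"un": 0, "##aff": 1, "##able": 2, "[UNK]": 3}
    wp = WordpieceTokenizer(vocab=toy_vocab)
    print(wp.tokenize("unaffable"))   # -> ["un", "##aff", "##able"]
    print(wp.tokenize("xyzzy"))       # -> ["[UNK]"]: no vocab prefix matches, so the whole token maps to unk_token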
+ tokens = tokenizer.tokenize(text) + ids = tokenizer.convert_tokens_to_ids(tokens) + print(tokens) + print(ids) diff --git a/models/language_modeling/pytorch/bert_large/training/gpu/lamb.py b/models/language_modeling/pytorch/bert_large/training/gpu/lamb.py new file mode 100644 index 000000000..4b5a9e132 --- /dev/null +++ b/models/language_modeling/pytorch/bert_large/training/gpu/lamb.py @@ -0,0 +1,190 @@ +# +# Copyright (c) 2022-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +"""Lamb optimizer.""" + +import collections +import math + +import torch +from tensorboardX import SummaryWriter +from torch.optim import Optimizer + + +def log_lamb_rs(optimizer: Optimizer, event_writer: SummaryWriter, token_count: int): + """Log a histogram of trust ratio scalars in across layers.""" + results = collections.defaultdict(list) + for group in optimizer.param_groups: + for p in group["params"]: + state = optimizer.state[p] + for i in ("weight_norm", "adam_norm", "trust_ratio"): + if i in state: + results[i].append(state[i]) + + for k, v in results.items(): + event_writer.add_histogram(f"lamb/{k}", torch.tensor(v), token_count) + + +class Lamb(Optimizer): + r"""Implements Lamb algorithm. + + It has been proposed in `Large Batch Optimization for Deep Learning: Training BERT in 76 minutes`_. + + Arguments: + params (iterable): iterable of parameters to optimize or dicts defining + parameter groups + lr (float, optional): learning rate (default: 1e-3) + betas (Tuple[float, float], optional): coefficients used for computing + running averages of gradient and its square (default: (0.9, 0.999)) + eps (float, optional): term added to the denominator to improve + numerical stability (default: 1e-8) + weight_decay (float, optional): weight decay (L2 penalty) (default: 0) + adam (bool, optional): always use trust ratio = 1, which turns this into + Adam. Useful for comparison purposes. + + .. 
_Large Batch Optimization for Deep Learning: Training BERT in 76 minutes: + https://arxiv.org/abs/1904.00962 + """ + + def __init__( + self, + params, + lr=1e-3, + betas=(0.9, 0.999), + eps=1e-6, + weight_decay=0, + adam=False, + bias_correction=True, + perform_allreduce=False, + ): + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay) + self.adam = adam + self.bias_correction = bias_correction + self.perform_allreduce = perform_allreduce + self.distributed = ( + torch.distributed.is_initialized() + and torch.distributed.get_world_size() > 1 + ) + super(Lamb, self).__init__(params, defaults) + + def sync_params(self): + if not self.distributed: + return + for group in self.param_groups: + for p in group["params"]: + torch.distributed.broadcase(p.data, 0) + + def sync_grads(self): + if not self.distributed: + return + world_size = torch.distributed.get_world_size() + for group in self.param_groups: + for p in group["params"]: + p.grad.data.div_(world_size) + torch.distributed.all_reduce(p.grad.data) + + def step(self, closure=None): + """Performs a single optimization step. + + Arguments: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. + """ + loss = None + if closure is not None: + loss = closure() + + if self.perform_allreduce: + self.sync_grads() + + for group in self.param_groups: + for p in group["params"]: + if p.grad is None: + continue + bf16_param = p.data.dtype == torch.bfloat16 + grad = p.grad.data + data = p.data + if grad.is_sparse: + raise RuntimeError( + "Lamb does not support sparse gradients, consider SparseAdam instad." + ) + + state = self.state[p] + # State initialization + if len(state) == 0: + state["step"] = 0 + # Exponential moving average of gradient values + state["exp_avg"] = torch.zeros_like(p.data, dtype=torch.float32) + # Exponential moving average of squared gradient values + state["exp_avg_sq"] = torch.zeros_like(p.data, dtype=torch.float32) + if bf16_param: + # additional fp32 version of master weights + state["data_fp32"] = p.data.to(torch.float32) + + exp_avg, exp_avg_sq = state["exp_avg"], state["exp_avg_sq"] + beta1, beta2 = group["betas"] + if bf16_param: + grad = grad.to(torch.float32) + data = state["data_fp32"] + + state["step"] += 1 + + # Decay the first and second moment running average coefficient + # m_t + exp_avg.mul_(beta1).add_(grad, alpha=(1 - beta1)) + # v_t + exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad) + + step_size = group["lr"] + if self.bias_correction: + # Paper v3 does not use debiasing. + exp_avg_hat = exp_avg / (1 - beta1 ** state["step"]) + exp_avg_sq_hat = exp_avg_sq / (1 - beta2 ** state["step"]) + # Apply bias to lr to avoid broadcast. 
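                    # Worked illustration of the debiasing above (values assumed): with beta1=0.9,
                    # beta2=0.999 and step=1, exp_avg_hat = exp_avg / (1 - 0.9) = 10 * exp_avg and
                    # exp_avg_sq_hat = exp_avg_sq / (1 - 0.999) = 1000 * exp_avg_sq, compensating
                    # for the zero-initialized moving averages; the correction shrinks toward 1 as
                    # the step count grows.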
+ else: + exp_avg_hat = exp_avg + exp_avg_sq_hat = exp_avg_sq + + adam_step = exp_avg_hat / exp_avg_sq_hat.sqrt().add(group["eps"]) + trust_ratio = 1 + if group["weight_decay"] != 0: + adam_step.add_(data, alpha=group["weight_decay"]) + + weight_norm = data.pow(2).sum().sqrt() # .clamp(0, 10) + adam_norm = adam_step.pow(2).sum().sqrt() + if weight_norm == 0 or adam_norm == 0: + trust_ratio = 1 + else: + trust_ratio = weight_norm / adam_norm + if self.adam: + trust_ratio = 1 + state["weight_norm"] = weight_norm + state["adam_norm"] = adam_norm + state["trust_ratio"] = trust_ratio + + data.add_(adam_step, alpha=-step_size * trust_ratio) + if bf16_param: + p.data = data.to(torch.bfloat16) + + return loss diff --git a/models/language_modeling/pytorch/bert_large/training/gpu/reference.py b/models/language_modeling/pytorch/bert_large/training/gpu/reference.py new file mode 100644 index 000000000..0771b27e1 --- /dev/null +++ b/models/language_modeling/pytorch/bert_large/training/gpu/reference.py @@ -0,0 +1,1692 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +""" +Fine-tuning the library models for masked language modeling (BERT, ALBERT, RoBERTa...) +on a text file or a dataset without using HuggingFace Trainer. + +Here is the full list of checkpoints on the hub that can be fine-tuned by this script: +https://huggingface.co/models?filter=masked-lm +""" +# You can also adapt this script on your own mlm task. Pointers for this are left as comments. 
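Before the pretraining driver below, a minimal sketch of exercising the Lamb optimizer defined above on a toy module (illustrative only; it assumes the tensorboardX dependency from the requirements file is installed and a PyTorch build that still accepts the positional addcmul_(value, tensor1, tensor2) form used in step()):

    import torch
    from lamb import Lamb

    toy = torch.nn.Linear(16, 4)
    opt = Lamb(toy.parameters(), lr=1e-3, betas=(0.9, 0.999), weight_decay=0.01)
    for _ in range(3):
        loss = toy(torch.randn(8, 16)).pow(2).mean()
        opt.zero_grad()
        loss.backward()
        opt.step()   # per-layer update scaled by trust_ratio = ||w|| / ||adam_step||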
+ +"""BERT Pretraining""" + +import argparse +import csv +import h5py +import os +import glob +import numpy as np +import torch +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, Dataset +from torch.utils.data.distributed import DistributedSampler +import logging +import math +import multiprocessing +import random +import re +import time +import sys + +from collections import OrderedDict +from concurrent.futures import ProcessPoolExecutor + +from schedulers import LinearWarmupPolyDecayScheduler + +import utils + +import torch.nn.functional as F +import argparse + + +import transformers +import modeling_bert_patched + +# from accelerate import Accelerator, DistributedType +from transformers import ( + CONFIG_MAPPING, + MODEL_MAPPING, + AutoConfig, + AutoModelForPreTraining, + SchedulerType, + get_scheduler, + set_seed, +) + +from pcl_pytorch_extension import bert as pcl_bert +from schedulers import LinearWarmUpScheduler, LinearWarmupPolyDecayScheduler +import mlperf_logger +from lamb import Lamb + +from pcl_pytorch_extension.optim import DistLamb + +ref_time = 0 + + +def get_time(): + global ref_time + t = time.time() + return (t - ref_time) * 1000.0 + + +def sync_clock(): + global ref_time + t = time.time() + if torch.distributed.is_initialized(): + torch.distributed.barrier() + t = time.time() + t = torch.tensor([t], dtype=torch.double) + torch.distributed.broadcast(t, 0) + torch.distributed.barrier() + t = t.item() + ref_time = t + + +logger = logging.getLogger(__name__) +MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys()) +MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) + + +class WorkerInitObj(object): + def __init__(self, seed): + self.seed = seed + + def __call__(self, id): + np.random.seed(seed=self.seed + id) + random.seed(self.seed + id) + + +def get_eval_batchsize_per_worker(args): + if torch.distributed.is_initialized(): + chunk_size = args.num_eval_examples // args.world_size + rank = args.local_rank + remainder = args.num_eval_examples % args.world_size + if rank < remainder: + return chunk_size + 1 + else: + return chunk_size + + +def create_pretraining_dataset( + input_file, max_pred_length, shared_list, args, worker_init_fn +): + train_data = pretraining_dataset( + input_file=input_file, max_pred_length=max_pred_length + ) + train_sampler = RandomSampler(train_data) + train_dataloader = DataLoader( + train_data, sampler=train_sampler, batch_size=args.train_batch_size + ) + + return train_dataloader, input_file + + +def create_eval_dataset(args, worker_init_fn): + eval_data = [] + for eval_file in sorted(os.listdir(args.eval_dir)): + eval_file_path = os.path.join(args.eval_dir, eval_file) + + if os.path.isfile(eval_file_path) and "part" in eval_file_path: + eval_data.extend( + pretraining_dataset( + eval_file_path, max_pred_length=args.max_predictions_per_seq + ) + ) + if len(eval_data) > args.num_eval_examples: + eval_data = eval_data[: args.num_eval_examples] + break + if torch.distributed.is_initialized(): + chunk_size = args.num_eval_examples // args.world_size + rank = args.local_rank + remainder = args.num_eval_examples % args.world_size + if rank < remainder: + eval_data = eval_data[ + (chunk_size + 1) * rank : (chunk_size + 1) * (rank + 1) + ] + else: + eval_data = eval_data[ + chunk_size * rank + remainder : chunk_size * (rank + 1) + remainder + ] + + eval_sampler = SequentialSampler(eval_data) + eval_dataloader = DataLoader( + eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size, num_workers=0 + ) + return 
eval_dataloader + + +class pretraining_dataset(Dataset): + def __init__(self, input_file, max_pred_length): + self.input_file = input_file + self.max_pred_length = max_pred_length + f = h5py.File(input_file, "r") + keys = [ + "input_ids", + "input_mask", + "segment_ids", + "masked_lm_positions", + "masked_lm_ids", + "next_sentence_labels", + ] + self.inputs = [np.asarray(f[key][:]) for key in keys] + print(f"Loaded {len(self.inputs[0]):d} samples from datafile: {input_file}") + f.close() + + def __len__(self): + "Denotes the total number of samples" + return len(self.inputs[0]) + + def __getitem__(self, index): + [ + input_ids, + input_mask, + segment_ids, + masked_lm_positions, + masked_lm_ids, + next_sentence_labels, + ] = [ + torch.from_numpy(input[index].astype(np.int64)) + if indice < 5 + else torch.from_numpy(np.asarray(input[index].astype(np.int64))) + for indice, input in enumerate(self.inputs) + ] + masked_lm_labels = torch.zeros(input_ids.shape, dtype=torch.long) - 100 + index = self.max_pred_length + masked_token_count = torch.count_nonzero(masked_lm_positions) + if masked_token_count != 0: + index = masked_token_count + masked_lm_labels[masked_lm_positions[:index]] = masked_lm_ids[:index] + + return [ + input_ids, + segment_ids, + input_mask, + masked_lm_labels, + next_sentence_labels, + ] + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Finetune a transformers model on a Masked Language Modeling task" + ) + + ## Required parameters + parser.add_argument( + "--input_dir", + default=None, + type=str, + required=True, + help="The input data dir. Should contain .hdf5 files for the task.", + ) + parser.add_argument( + "--output_dir", + default=None, + type=str, + required=True, + help="The output directory where the model checkpoints will be written.", + ) + parser.add_argument( + "--eval_dir", + default=None, + type=str, + help="The eval data dir. 
Should contain .hdf5 files for the task.", + ) + parser.add_argument( + "--eval_iter_start_samples", + default=3000000, + type=int, + help="Sample to begin performing eval.", + ) + parser.add_argument( + "--eval_iter_samples", + default=-1, + type=int, + help="If set to -1, disable eval, \ + else evaluate every eval_iter_samples during training", + ) + parser.add_argument( + "--num_eval_examples", + default=10000, + type=int, + help="number of eval examples to run eval on", + ) + parser.add_argument( + "--init_checkpoint", + default=None, + type=str, + help="The initial checkpoint to start training from.", + ) + parser.add_argument( + "--init_tf_checkpoint", + default=None, + type=str, + help="The initial TF checkpoint to start training from.", + ) + parser.add_argument( + "--train_file", + type=str, + default=None, + help="A csv or a json file containing the training data.", + ) + parser.add_argument( + "--validation_file", + type=str, + default=None, + help="A csv or a json file containing the validation data.", + ) + parser.add_argument( + "--model_name_or_path", + type=str, + help="Path to pretrained model or model identifier from huggingface.co/models.", + required=True, + ) + parser.add_argument( + "--config_name", + type=str, + default=None, + help="Pretrained config name or path if not the same as model_name", + ) + parser.add_argument( + "--per_device_train_batch_size", + type=int, + default=8, + help="Batch size (per device) for the training dataloader.", + ) + parser.add_argument( + "--per_device_eval_batch_size", + type=int, + default=8, + help="Batch size (per device) for the evaluation dataloader.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=5e-5, + help="Initial learning rate (after the potential warmup period) to use.", + ) + parser.add_argument( + "--max_predictions_per_seq", + default=76, + type=int, + help="The maximum total of masked tokens in input sequence", + ) + parser.add_argument( + "--train_batch_size", + default=18, + type=int, + help="Total batch size for training.", + ) + parser.add_argument( + "--eval_batch_size", + default=128, + type=int, + help="Total batch size for training.", + ) + parser.add_argument( + "--weight_decay_rate", + default=0.01, + type=float, + help="weight decay rate for LAMB.", + ) + parser.add_argument( + "--opt_lamb_beta_1", default=0.9, type=float, help="LAMB beta1." + ) + parser.add_argument( + "--opt_lamb_beta_2", default=0.999, type=float, help="LAMB beta2." + ) + parser.add_argument( + "--max_steps", + default=1536, + type=float, + help="Total number of training steps to perform.", + ) + parser.add_argument( + "--max_samples_termination", + default=14000000, + type=float, + help="Total number of training samples to run.", + ) + parser.add_argument( + "--warmup_proportion", + default=0.01, + type=float, + help="Proportion of optimizer update steps to perform linear learning rate warmup for. " + "Typically 1/8th of steps for Phase2", + ) + parser.add_argument( + "--warmup_steps", + default=0, + type=float, + help="Number of optimizer update steps to perform linear learning rate warmup for. " + "Typically 1/8th of steps for Phase2", + ) + parser.add_argument( + "--start_warmup_step", default=0, type=float, help="Starting step for warmup. " + ) + parser.add_argument( + "--log_freq", + type=float, + default=10000.0, + help="frequency of logging loss. 
If not positive, no logging is provided for training loss", + ) + parser.add_argument( + "--checkpoint_activations", + default=False, + action="store_true", + help="Whether to use gradient checkpointing", + ) + parser.add_argument( + "--resume_from_checkpoint", + default=False, + action="store_true", + help="Whether to resume training from checkpoint. If set, precedes init_checkpoint/init_tf_checkpoint", + ) + parser.add_argument( + "--keep_n_most_recent_checkpoints", + type=int, + default=20, + help="Number of checkpoints to keep (rolling basis).", + ) + parser.add_argument( + "--num_samples_per_checkpoint", + type=int, + default=500000, + help="Number of update steps until a model checkpoint is saved to disk.", + ) + parser.add_argument( + "--min_samples_to_start_checkpoints", + type=int, + default=3000000, + help="Number of update steps until model checkpoints start saving to disk.", + ) + parser.add_argument( + "--skip_checkpoint", + default=False, + action="store_true", + help="Whether to save checkpoints", + ) + parser.add_argument( + "--phase2", + default=False, + action="store_true", + help="Only required for checkpoint saving format", + ) + parser.add_argument( + "--do_train", + default=False, + action="store_true", + help="Whether to run training.", + ) + parser.add_argument( + "--bert_config_path", + type=str, + default="/workspace/phase1", + help="Path bert_config.json is located in", + ) + parser.add_argument( + "--target_mlm_accuracy", + type=float, + default=0.72, + help="Stop training after reaching this Masked-LM accuracy", + ) + parser.add_argument( + "--train_mlm_accuracy_window_size", + type=int, + default=0, + help="Average accuracy over this amount of batches before performing a stopping criterion test", + ) + parser.add_argument( + "--num_epochs_to_generate_seeds_for", + type=int, + default=2, + help="Number of epochs to plan seeds for. Same set across all workers.", + ) + parser.add_argument( + "--use_gradient_as_bucket_view", + default=False, + action="store_true", + help="Turn ON gradient_as_bucket_view optimization in native DDP.", + ) + parser.add_argument( + "--dense_seq_output", + default=False, + action="store_true", + help="Whether to run with optimizations.", + ) + parser.add_argument( + "--no_ddp", default=False, action="store_true", help="Whether to use DDP." + ) + parser.add_argument( + "--dist_lamb", + default=False, + action="store_true", + help="Whether to use DistLamb from pcl_bert", + ) + + parser.add_argument( + "--weight_decay", type=float, default=0.0, help="Weight decay to use." + ) + parser.add_argument( + "--num_train_epochs", + type=int, + default=3, + help="Total number of training epochs to perform.", + ) + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument( + "--lr_scheduler_type", + type=SchedulerType, + default="linear", + help="The scheduler type to use.", + choices=[ + "linear", + "cosine", + "cosine_with_restarts", + "polynomial", + "constant", + "constant_with_warmup", + ], + ) + parser.add_argument( + "--seed", type=int, default=42, help="A seed for reproducible training." 
+ ) + parser.add_argument( + "--model_type", + type=str, + default=None, + help="Model type to use if training from scratch.", + choices=MODEL_TYPES, + ) + + parser.add_argument( + "--preprocessing_num_workers", + type=int, + default=None, + help="The number of processes to use for the preprocessing.", + ) + parser.add_argument( + "--local_rank", default=0, type=int, help="Total batch size for training." + ) + parser.add_argument( + "--world_size", default=1, type=int, help="Total batch size for training." + ) + + parser.add_argument( + "--use_pcl", + action="store_true", + help="Whether to use PCL Fused impl when available", + ) + parser.add_argument( + "--pcl_bf16", action="store_true", help="Whether to use PCL BF16 impl" + ) + parser.add_argument( + "--profile", action="store_true", help="Whether to enable profiling" + ) + + parser.add_argument( + "--skip_exchange_padding", + default=False, + action="store_true", + help="Whether to skip padding exchange.", + ) + parser.add_argument( + "--unpad", + default=False, + action="store_true", + help="Whether to run with unpadding.", + ) + parser.add_argument( + "--benchmark_steps", + type=int, + default=0, + help="Number of steps to run for benchmark.", + ) + parser.add_argument( + "--dist_backend", + type=str, + default="ccl", + help="Specify distributed backend to use.", + ) + parser.add_argument( + "--multi_instance", + action="store_true", + help="Skip torch.distributed initialization to Run multiple instance independently", + ) + parser.add_argument( + "--dist_profile", + action="store_true", + help="Whether to enable distributed timing profile", + ) + + args = parser.parse_args() + + if args.output_dir is not None: + os.makedirs(args.output_dir, exist_ok=True) + # assert args.init_checkpoint is not None or args.init_tf_checkpoint is not None or found_resume_checkpoint(args), \ + # "Must specify --init_checkpoint, --init_tf_checkpoint or have ckpt to resume from in --output_dir of the form *.pt" + + # assert not (args.init_checkpoint is not None and args.init_tf_checkpoint is not None), \ + # "Can only specify one of --init_checkpoint and --init_tf_checkpoint" + return args + + +def found_resume_checkpoint(args): + if args.phase2: + checkpoint_str = "phase2_ckpt*.pt" + else: + checkpoint_str = "phase1_ckpt*.pt" + return ( + args.resume_from_checkpoint + and len(glob.glob(os.path.join(args.output_dir, checkpoint_str))) > 0 + ) + + +def setup_training(args): + device = torch.device("cpu") + if int(os.environ.get("PMI_SIZE", "0")) > 1 and not args.multi_instance: + if args.dist_backend == "ccl": + try: + import torch_ccl + except: + print("CCL backend requested but import torch_ccl failed") + raise + elif args.dist_backend == "mpi": + if not torch.distributed.is_mpi_available(): + try: + import torch_mpi + except: + print( + "MPI backend requested but not available try installing torch_mpi module" + ) + raise + else: + raise ValueError(f"{args.dist_backend} backend requested but not supported") + + os.environ["RANK"] = os.environ.get("PMI_RANK", "0") + os.environ["WORLD_SIZE"] = os.environ.get("PMI_SIZE", "1") + torch.distributed.init_process_group(backend=args.dist_backend) + device = torch.device("cpu") + args.local_rank = torch.distributed.get_rank() + args.world_size = torch.distributed.get_world_size() + if args.local_rank == 0: + print( + f"##################Using {args.dist_backend.upper()} dist run with {args.world_size} ranks", + flush=True, + ) + if args.gradient_accumulation_steps < 1: + raise ValueError( + "Invalid 
gradient_accumulation_steps parameter: {}, should be >= 1".format( + args.gradient_accumulation_steps + ) + ) + if args.train_batch_size % args.gradient_accumulation_steps != 0: + raise ValueError( + "Invalid gradient_accumulation_steps parameter: {}, batch size {} should be divisible".format( + args.gradient_accumulation_steps, args.train_batch_size + ) + ) + + args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps + + if not (args.do_train or (args.eval_dir and args.eval_iter_samples <= 0)): + raise ValueError(" `do_train` or should be in offline eval mode") + + if not args.resume_from_checkpoint or not os.path.exists(args.output_dir): + os.makedirs(args.output_dir, exist_ok=True) + return device, args + + +def prepare_model_and_optimizer(args, device): + global_step = 0 + args.resume_step = 0 + checkpoint = None + # download model & vocab. + if args.config_name: + config = AutoConfig.from_pretrained(args.config_name) + elif args.model_name_or_path: + config = AutoConfig.from_pretrained(args.model_name_or_path) + else: + config = CONFIG_MAPPING[args.model_type]() + logger.warning("You are instantiating a new config instance from scratch.") + + config.dense_seq_output = args.dense_seq_output + with pcl_bert.pcl_impl(args.use_pcl, args.pcl_bf16, args.unpad): + if args.model_name_or_path: + model = AutoModelForPreTraining.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + ) + else: + logger.info("Training new model from scratch") + model = AutoModelForPreTraining.from_config(config) + if args.use_pcl: + pcl_bert.block(model) + + param_optimizer = list(model.named_parameters()) + + no_decay = ["bias", "gamma", "beta", "LayerNorm"] + + optimizer_grouped_parameters = [ + { + "params": [ + p for n, p in param_optimizer if not any(nd in n for nd in no_decay) + ], + "weight_decay": args.weight_decay_rate, + }, + { + "params": [ + p for n, p in param_optimizer if any(nd in n for nd in no_decay) + ], + "weight_decay": 0.0, + }, + ] + + print( + "GroupSizes: ", + [sum([p.numel() for p in g["params"]]) for g in optimizer_grouped_parameters], + ) + if args.dist_lamb: + optimizer = DistLamb( + optimizer_grouped_parameters, + lr=args.learning_rate, + betas=(args.opt_lamb_beta_1, args.opt_lamb_beta_2), + ) + else: + optimizer = Lamb( + optimizer_grouped_parameters, + lr=args.learning_rate, + betas=(args.opt_lamb_beta_1, args.opt_lamb_beta_2), + ) + + mlperf_logger.log_event( + key=mlperf_logger.constants.OPT_BASE_LR, value=args.learning_rate, sync=False + ) + mlperf_logger.log_event( + key="opt_epsilon", value=optimizer.defaults["eps"], sync=False + ) + b1, b2 = optimizer.defaults["betas"] + mlperf_logger.log_event(key="opt_lamb_beta_1", value=b1, sync=False) + mlperf_logger.log_event(key="opt_lamb_beta_2", value=b2, sync=False) + mlperf_logger.log_event( + key="opt_lamb_weight_decay_rate", + value=optimizer.defaults["weight_decay"], + sync=False, + ) + + if args.warmup_steps == 0: + warmup_steps = int(args.max_steps * args.warmup_proportion) + warmup_start = 0 + else: + warmup_steps = args.warmup_steps + warmup_start = args.start_warmup_step + + lr_scheduler = LinearWarmupPolyDecayScheduler( + optimizer, + start_warmup_steps=warmup_start, + warmup_steps=warmup_steps, + total_steps=args.max_steps, + end_learning_rate=0.0, + degree=1.0, + ) + return model, optimizer, lr_scheduler, checkpoint, global_step + + +def take_optimizer_step(args, optimizer, model, overflow_buf, global_step): + global skipped_steps + 
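    # A quick check of the warmup wiring above, using the parser defaults (assumed; real runs
    # override them via flags): with warmup_steps=0, warmup_proportion=0.01 and max_steps=1536,
    # the code takes warmup_steps = int(1536 * 0.01) = 15, so the LR ramps linearly for 15
    # optimizer steps and then decays with degree=1.0 (i.e. linearly) to end_learning_rate=0.0
    # at step 1536.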
optimizer.step() + global_step += 1 + return global_step + + +def run_eval( + model, + eval_dataloader, + device, + num_eval_examples, + args, + first_eval=False, + use_cache=False, +): + model.eval() + total_eval_loss, total_eval_mlm_acc = 0.0, 0.0 + total_masked = 0 + with torch.no_grad(): + for batch in eval_dataloader: + ( + input_ids, + segment_ids, + input_mask, + masked_lm_labels, + next_sentence_labels, + ) = batch + outputs = model( + input_ids=input_ids, + token_type_ids=segment_ids, + attention_mask=input_mask, + labels=masked_lm_labels, + next_sentence_label=next_sentence_labels, + ) + mlm_acc, num_masked = calc_mlm_acc( + outputs, masked_lm_labels, args.dense_seq_output + ) + total_eval_loss += outputs.loss.item() * num_masked + total_eval_mlm_acc += mlm_acc * num_masked + total_masked += num_masked + model.train() + total_masked = torch.tensor(total_masked, device=device, dtype=torch.int64) + total_eval_loss = torch.tensor(total_eval_loss, device=device, dtype=torch.float64) + if torch.distributed.is_initialized(): + # Collect total scores from all ranks + torch.distributed.all_reduce( + total_eval_mlm_acc, op=torch.distributed.ReduceOp.SUM + ) + torch.distributed.all_reduce(total_eval_loss, op=torch.distributed.ReduceOp.SUM) + torch.distributed.all_reduce(total_masked, op=torch.distributed.ReduceOp.SUM) + + # Average by number of examples + total_eval_mlm_acc /= total_masked + total_eval_loss /= total_masked + + return total_eval_loss, total_eval_mlm_acc + + +def global_batch_size(args): + return args.train_batch_size * args.gradient_accumulation_steps * args.world_size + + +def calc_mlm_acc(outputs, masked_lm_labels, dense_seq_output=False): + prediction_scores = outputs.prediction_logits + masked_lm_labels_flat = masked_lm_labels.view(-1) + mlm_labels = masked_lm_labels_flat[masked_lm_labels_flat != -100] + if not dense_seq_output: + prediction_scores_flat = prediction_scores.view(-1, prediction_scores.shape[-1]) + mlm_predictions_scores = prediction_scores_flat[masked_lm_labels_flat != -100] + mlm_predictions = mlm_predictions_scores.argmax(dim=-1) + else: + mlm_predictions = prediction_scores.argmax(dim=-1) + + num_masked = mlm_labels.numel() + mlm_acc = (mlm_predictions == mlm_labels).sum(dtype=torch.float) / num_masked + + return mlm_acc, num_masked + + +def calc_accuracy(outputs, masked_lm_labels, next_sentence_label, args): + loss = outputs.loss.item() + prediction_logits = outputs.prediction_logits + seq_relationship_logits = outputs.seq_relationship_logits + mlm_acc, num_masked = calc_mlm_acc(outputs, masked_lm_labels, args.dense_seq_output) + seq_acc_t = ( + torch.argmax(seq_relationship_logits, dim=-1) + .eq(next_sentence_label.view([-1])) + .to(torch.float) + ) + seq_acc_true, seq_tot = seq_acc_t.sum().item(), seq_acc_t.numel() + seq_acc = seq_acc_true / seq_tot + return loss, mlm_acc, num_masked, seq_acc, seq_tot + + +def exchange_padding_fast( + input_ids, + segment_ids, + input_mask, + masked_lm_labels, + next_sentence_labels, + max_batch_size, +): + device = "cpu" + comm_dtype = torch.int32 + pad_size = max_batch_size - input_ids.shape[0] + if pad_size > 0: + input_ids = F.pad(input_ids, (0, 0, 0, pad_size)) + segment_ids = F.pad(segment_ids, (0, 0, 0, pad_size)) + input_mask = F.pad(input_mask, (0, 0, 0, pad_size)) + masked_lm_labels = F.pad(masked_lm_labels, (0, 0, 0, pad_size)) + next_sentence_labels = F.pad(next_sentence_labels, (0, pad_size)) + nranks = torch.distributed.get_world_size() + nseqs = input_mask.shape[0] + ntokensperseq = 
input_mask.shape[1] + my_rank = torch.distributed.get_rank() + + flattened_length_seq = nseqs * ntokensperseq + flattened_length_nsp = nseqs + + def get_local_packet_size(): + return 4 * flattened_length_seq + flattened_length_nsp + + # Storing tensors in same order as arguments + def encode_packet( + input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels + ): + + packet = torch.zeros([get_local_packet_size()], device=device, dtype=comm_dtype) + + curr_pos = 0 + + packet[curr_pos : curr_pos + flattened_length_seq] = input_ids.view(-1)[:] + curr_pos += flattened_length_seq + + packet[curr_pos : curr_pos + flattened_length_seq] = segment_ids.view(-1)[:] + curr_pos += flattened_length_seq + + packet[curr_pos : curr_pos + flattened_length_seq] = input_mask.view(-1)[:] + curr_pos += flattened_length_seq + + packet[curr_pos : curr_pos + flattened_length_seq] = masked_lm_labels.view(-1)[ + : + ] + curr_pos += flattened_length_seq + + packet[curr_pos : curr_pos + flattened_length_nsp] = next_sentence_labels.view( + -1 + )[:] + + return packet + + def decode_packet(flat_packet): + packet = flat_packet.view(nranks, get_local_packet_size()) + + curr_pos = 0 + + input_ids_ = ( + packet[:, curr_pos : curr_pos + flattened_length_seq] + .contiguous() + .view(nranks, nseqs, ntokensperseq) + ) + curr_pos += flattened_length_seq + + segment_ids_ = ( + packet[:, curr_pos : curr_pos + flattened_length_seq] + .contiguous() + .view(nranks, nseqs, ntokensperseq) + ) + curr_pos += flattened_length_seq + + input_mask_ = ( + packet[:, curr_pos : curr_pos + flattened_length_seq] + .contiguous() + .view(nranks, nseqs, ntokensperseq) + ) + curr_pos += flattened_length_seq + + masked_lm_labels_ = ( + packet[:, curr_pos : curr_pos + flattened_length_seq] + .contiguous() + .view(nranks, nseqs, ntokensperseq) + ) + curr_pos += flattened_length_seq + + next_sentence_labels_ = ( + packet[:, curr_pos : curr_pos + flattened_length_nsp] + .contiguous() + .view(nranks, nseqs) + ) + + return ( + input_ids_, + segment_ids_, + input_mask_, + masked_lm_labels_, + next_sentence_labels_, + ) + + tensors = encode_packet( + input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels + ) + + tensors_ = torch.zeros( + [nranks, get_local_packet_size()], device=device, dtype=comm_dtype + ) + tensors_ = list(torch.split(tensors_, 1)) + + # torch.distributed.all_gather(tensors_, tensors.view(torch.float16)) + torch.distributed.all_gather(tensors_, tensors) + + tensors_ = torch.stack(tensors_).view(comm_dtype).long() + ( + input_ids_, + segment_ids_, + input_mask_, + masked_lm_labels_, + next_sentence_labels_, + ) = decode_packet(tensors_) + + seqlens_, indices = torch.sort(input_mask_.sum(dim=2).view(-1), descending=True) + + if pad_size > 0: + input_ids_sorted = input_ids_.view(nranks * nseqs, ntokensperseq)[indices[:], :] + segment_ids_sorted = segment_ids_.view(nranks * nseqs, ntokensperseq)[ + indices[:], : + ] + input_mask_sorted = input_mask_.view(nranks * nseqs, ntokensperseq)[ + indices[:], : + ] + masked_lm_labels_sorted = masked_lm_labels_.view(nranks * nseqs, ntokensperseq)[ + indices[:], : + ] + next_sentence_labels_sorted = next_sentence_labels_.view(nranks * nseqs)[ + indices[:] + ] + # we need to remove the empty sequences we added to the batch + valid_idx = seqlens_.view(nseqs, nranks)[:, my_rank] > 0 + input_ids_sorted = input_ids_sorted.view(nseqs, nranks, ntokensperseq)[ + valid_idx, my_rank, : + ].contiguous() + segment_ids_sorted = segment_ids_sorted.view(nseqs, nranks, ntokensperseq)[ 
+ valid_idx, my_rank, : + ].contiguous() + input_mask_sorted = input_mask_sorted.view(nseqs, nranks, ntokensperseq)[ + valid_idx, my_rank, : + ].contiguous() + masked_lm_labels_sorted = masked_lm_labels_sorted.view( + nseqs, nranks, ntokensperseq + )[valid_idx, my_rank, :].contiguous() + next_sentence_labels_sorted = next_sentence_labels_sorted.view(nseqs, nranks)[ + valid_idx, my_rank + ].contiguous() + else: + indices_ = indices.view(nseqs, nranks)[:, my_rank] + input_ids_sorted = input_ids_.view(nseqs * nranks, ntokensperseq)[ + indices_, : + ].contiguous() + segment_ids_sorted = segment_ids_.view(nseqs * nranks, ntokensperseq)[ + indices_, : + ].contiguous() + input_mask_sorted = input_mask_.view(nseqs * nranks, ntokensperseq)[ + indices_, : + ].contiguous() + masked_lm_labels_sorted = masked_lm_labels_.view(nseqs * nranks, ntokensperseq)[ + indices_, : + ].contiguous() + next_sentence_labels_sorted = next_sentence_labels_.view(nseqs * nranks)[ + indices_ + ].contiguous() + + return ( + input_ids_sorted, + segment_ids_sorted, + input_mask_sorted, + masked_lm_labels_sorted, + next_sentence_labels_sorted, + ) + + +def main(): + args = parse_args() + status = "aborted" # later set to 'success' if termination criteria met + mlperf_logger.log_start( + key=mlperf_logger.constants.INIT_START, log_all_ranks=True, sync=False + ) + device, args = setup_training(args) + print("####################args.world_size={}".format(args.world_size)) + total_batch_size = global_batch_size(args) + mlperf_logger.mlperf_submission_log("bert") + + mlperf_logger.log_event( + key=mlperf_logger.constants.SEED, value=args.seed, sync=False + ) + mlperf_logger.log_event( + key=mlperf_logger.constants.GLOBAL_BATCH_SIZE, + value=global_batch_size(args), + sync=False, + ) + mlperf_logger.log_event(key="d_batch_size", value=args.train_batch_size, sync=False) + mlperf_logger.log_event( + key=mlperf_logger.constants.GRADIENT_ACCUMULATION_STEPS, + value=args.gradient_accumulation_steps, + sync=False, + ) + mlperf_logger.log_event( + key="max_predictions_per_seq", value=args.max_predictions_per_seq, sync=False + ) + mlperf_logger.log_event( + key="opt_learning_rate_training_steps", value=args.max_steps, sync=False + ) + mlperf_logger.log_event( + key="num_warmup_steps", + value=int(args.warmup_proportion * args.max_steps) + if args.warmup_steps == 0 + else args.warmup_steps, + sync=False, + ) + # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. + # Make one log on every process with the configuration for debugging. 
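For reference, the GLOBAL_BATCH_SIZE value logged above comes from the global_batch_size helper; a worked example using the parser defaults and an assumed 8-rank run:

    # train_batch_size * gradient_accumulation_steps * world_size (world_size assumed)
    print(18 * 1 * 8)   # -> 144 sequences contribute to each optimizer update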
+ logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, + ) + if args.local_rank == 0 or args.local_rank == -1: + print("parsed args:") + print(args) + # Prepare optimizer + ( + model, + optimizer, + lr_scheduler, + checkpoint, + global_step, + ) = prepare_model_and_optimizer(args, device) + worker_seeds, shuffling_seeds = utils.setup_seeds( + args.seed, args.num_epochs_to_generate_seeds_for, device + ) + worker_seed = worker_seeds[args.local_rank] + + random.seed(worker_seed) + np.random.seed(worker_seed) + torch.manual_seed(worker_seed) + worker_init = WorkerInitObj(worker_seed) + samples_trained = ( + global_step + * args.train_batch_size + * args.gradient_accumulation_steps + * args.world_size + ) + final_loss = float("inf") + train_time_raw = float("inf") + raw_train_start = time.time() + if args.do_train: + model.train() + most_recent_ckpts_paths = [] + average_loss = 0.0 # averaged loss every args.log_freq steps + epoch = 1 + training_steps = 0 + end_training, converged = False, False + samples_trained_prev = 0 + + # pre-compute eval boundaries + samples_trained_per_step = ( + args.train_batch_size * args.gradient_accumulation_steps * args.world_size + ) + start, stop, step = ( + args.eval_iter_start_samples, + args.max_samples_termination, + args.eval_iter_samples, + ) + eval_steps = [ + math.ceil(i / samples_trained_per_step) + for i in np.arange(start, stop, step) + ] + eval_count = 0 + next_eval_step = eval_steps[eval_count] + # pool = ProcessPoolExecutor(1) + + if args.target_mlm_accuracy: + if args.train_mlm_accuracy_window_size > 0: + accuracy_scores = [] + avg_mlm_accuracy = torch.Tensor([0]) + + first_epoch = True + if found_resume_checkpoint(args): + f_start_id = checkpoint["files"][0] + files = checkpoint["files"][1:] + num_files = len(files) + else: + files = [ + os.path.join(args.input_dir, f) + for f in os.listdir(args.input_dir) + if os.path.isfile(os.path.join(args.input_dir, f)) and "part" in f + ] + files.sort() + num_files = len(files) + random.Random(shuffling_seeds[epoch]).shuffle(files) + f_start_id = 0 + global skipped_steps + if torch.distributed.is_initialized() and not args.no_ddp: + model = torch.nn.parallel.DistributedDataParallel( + model, + # bucket_cap_mb=8192, + gradient_as_bucket_view=args.use_gradient_as_bucket_view, + ) + + mlperf_logger.log_end(key=mlperf_logger.constants.INIT_STOP, sync=False) + mlperf_logger.log_start(key=mlperf_logger.constants.RUN_START, sync=True) + mlperf_logger.barrier() + + now_step, now_skipped, skip_interval = 0, 0, 0 + # Start prefetching eval dataset + # if args.eval_dir: + # eval_dataset_future = pool.submit(create_eval_dataset, args, worker_init_fn=worker_init) + # comparing to number of samples in a shard. 
There are ~38k samples in 4096-way shard, comparing to 10k to be safe + need_next_training_shard = ( + args.train_batch_size * args.gradient_accumulation_steps * args.max_steps + > 10000 + ) + + sync_clock() + + while global_step < args.max_steps and not end_training: + mlperf_logger.log_start( + key=mlperf_logger.constants.EPOCH_START, + metadata={"epoch_num": epoch}, + sync=False, + ) + mlperf_logger.log_start( + key=mlperf_logger.constants.BLOCK_START, + metadata={"first_epoch_num": epoch, "epoch_count": 1}, + sync=False, + ) + if args.local_rank == 0 or args.local_rank == -1: + now_time = time.time() + print("epoch:", epoch) + + thread = None + + # Reshuffle file list on subsequent epochs + if not first_epoch: + files = [ + os.path.join(args.input_dir, f) + for f in os.listdir(args.input_dir) + if os.path.isfile(os.path.join(args.input_dir, f)) and "part" in f + ] + files.sort() + num_files = len(files) + random.Random(shuffling_seeds[epoch]).shuffle(files) + f_start_id = 0 + + first_epoch = False + + shared_file_list = {} + + if torch.distributed.is_initialized() and args.world_size > num_files: + remainder = args.world_size % num_files + data_file = files[ + ( + f_start_id * args.world_size + + args.local_rank + + remainder * f_start_id + ) + % num_files + ] + else: + data_file = files[ + (f_start_id * args.world_size + args.local_rank) % num_files + ] + + # data_file = '/work/ddkalamk/bert/dataset/2048_shards_uncompressed/part_453_of_2048.hdf5' #files[0] + previous_file = data_file + + train_data = pretraining_dataset(data_file, args.max_predictions_per_seq) + train_sampler = RandomSampler(train_data) + # train_sampler = SequentialSampler(train_data) + train_dataloader = DataLoader( + train_data, sampler=train_sampler, batch_size=args.train_batch_size + ) + send_lr_in_parallel = False + lr_cpu = torch.tensor([0.0], dtype=torch.float32, device="cpu") + for f_id in range(f_start_id + 1, len(files)): + if args.world_size > num_files: + data_file = files[ + (f_id * args.world_size + args.local_rank + remainder * f_id) + % num_files + ] + else: + data_file = files[ + (f_id * args.world_size + args.local_rank) % num_files + ] + + previous_file = data_file + if need_next_training_shard: + # dataset_future = pool.submit(create_pretraining_dataset, data_file, args.max_predictions_per_seq, shared_file_list, args, worker_init_fn=worker_init) + pass + t0 = get_time() + for step, batch in enumerate(train_dataloader): + training_steps += 1 + if args.profile and args.use_pcl: + pcl_bert.reset_debug_timers() + t1 = get_time() + ( + input_ids, + segment_ids, + input_mask, + masked_lm_labels, + next_sentence_labels, + ) = batch + if ( + args.world_size > 1 + and args.unpad + and not args.skip_exchange_padding + ): + ( + input_ids, + segment_ids, + input_mask, + masked_lm_labels, + next_sentence_labels, + ) = exchange_padding_fast( + input_ids, + segment_ids, + input_mask, + masked_lm_labels, + next_sentence_labels, + args.train_batch_size, + ) + # print(f"Input shape: {batch['input_ids'].shape}") + t2 = get_time() + outputs = model( + input_ids=input_ids, + token_type_ids=segment_ids, + attention_mask=input_mask, + labels=masked_lm_labels, + next_sentence_label=next_sentence_labels, + ) + t3 = get_time() + loss = outputs.loss + # loss = loss / args.gradient_accumulation_steps + if (step + 1) % args.gradient_accumulation_steps == 0 or step == len( + train_dataloader + ) - 1: + loss.backward() + t4 = get_time() + if hasattr(optimizer, "clip_grad_norm_"): + ggnorm = optimizer.clip_grad_norm_(1.0) + 
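                        # ggnorm is reported as "GNorm" in the per-step log below; the
                        # torch.nn.utils fallback in the next branch returns the total gradient
                        # norm computed before clipping to the 1.0 threshold.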
else: + ggnorm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) + t40 = get_time() + mlperf_logger.barrier() + t41 = get_time() + if args.no_ddp: + optimizer.sync_grads() + t42 = get_time() + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + # progress_bar.update(1) + else: + # with model.no_sync(): + loss.backward() + t4 = get_time() + if hasattr(optimizer, "clip_grad_norm_"): + ggnorm = optimizer.clip_grad_norm_(1.0) + else: + ggnorm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) + t40 = get_time() + mlperf_logger.barrier() + t42 = t41 = get_time() + optimizer.acc_and_zero_grad() + # mlperf_logger.barrier() + t5 = get_time() + gloss, lm_acc, num_masked, seq_acc, seq_tot = calc_accuracy( + outputs, masked_lm_labels, next_sentence_labels, args + ) + data_time = t1 - t0 + xchg_time = t2 - t1 + fwd_time = t3 - t2 + bwd_time = t4 - t3 + clip_time = t40 - t4 + sync_time = t41 - t40 + allr_time = t42 - t41 + opt_time = t5 - t42 + total_time = t5 - t0 + nnz = input_mask.sum() + + if ( + args.dist_profile + and torch.distributed.is_initialized() + and args.world_size > 1 + ): + l_time = [ + nnz, + fwd_time, + bwd_time, + fwd_time + bwd_time, + sync_time, + allr_time, + total_time, + t40, + t41, + t42, + ] + t_time = torch.tensor(l_time) + t_all_time = torch.empty([args.world_size, len(l_time)]) + l_all_time = list(t_all_time.split(1)) + torch.distributed.all_gather(l_all_time, t_time) + + if args.local_rank == 0: + print( + f"Step {training_steps:5d}: loss: {gloss:6.3f} lm_acc: {lm_acc:.3f} seq_acc: {seq_acc:.3f} lbs: {args.train_batch_size} gbs: {total_batch_size} DT: {data_time:.1f} XT: {xchg_time:.1f} FT: {fwd_time:.1f} BT: {bwd_time:.1f} OT: {opt_time:.1f} TT: {total_time:.1f} LR: {optimizer.param_groups[0]['lr']:.3e} GNorm: {ggnorm:.2f} ST: {sync_time:.1f} AR: {allr_time:.1f} GC: {clip_time:.1f}" + ) + if ( + args.dist_profile + and torch.distributed.is_initialized() + and args.world_size > 1 + ): + + def print_summary(desc, t): + if isinstance(t, torch.Tensor): + t = t.tolist() + tmin = min(t) + tmin_i = t.index(tmin) + tmax = max(t) + tmax_i = t.index(tmax) + tavg = sum(t) / len(t) + print( + f"{desc} min: {tmin:10.1f} ({tmin_i:2d}) max: {tmax:10.1f} ({tmax_i:2d}) diff: {tmax-tmin:10.1f} ({(tmax-tmin)*100.0/tmin:5.1f}) avg: {tavg:10.1f}" + ) + + print_summary("NNZ:", t_all_time[:, 0]) + print_summary("FWD:", t_all_time[:, 1]) + print_summary("BWD:", t_all_time[:, 2]) + print_summary("FBW:", t_all_time[:, 3]) + print_summary("SYN:", t_all_time[:, 4]) + print_summary("SYS:", t_all_time[:, 7]) + print_summary("ARS:", t_all_time[:, 8]) + print_summary("ARE:", t_all_time[:, 9]) + if args.local_rank == 0 and args.profile and args.use_pcl: + pcl_bert.print_debug_timers() + if args.benchmark_steps > 0 and global_step + 1 >= args.benchmark_steps: + mlperf_logger.barrier() + if args.local_rank == 0: + print(f"Done Benchmarking {args.benchmark_steps} steps.") + sys.exit(0) + + update_step = training_steps % args.gradient_accumulation_steps == 0 + divisor = args.gradient_accumulation_steps + if args.log_freq > 0: + average_loss += loss.item() + if update_step: + now_lr = optimizer.param_groups[0]["lr"] + # optimizer.step() + global_step += 1 + if ( + args.eval_dir + and args.eval_iter_samples > 0 + and global_step == next_eval_step + ): + # on first eval, get eval_dataloader + if eval_count == 0: + eval_dataloader = create_eval_dataset( + args, worker_init_fn=worker_init + ) # eval_dataset_future.result(timeout=None) + samples_trained = ( + global_step + * 
args.train_batch_size + * args.gradient_accumulation_steps + * args.world_size + ) + samples_trained_prev = samples_trained + eval_avg_loss, eval_avg_mlm_accuracy = run_eval( + model, + eval_dataloader, + device, + args.num_eval_examples, + args, + first_eval=(eval_count == 0), + ) + if args.local_rank == 0 or args.local_rank == -1: + mlperf_logger.log_event( + key=mlperf_logger.constants.EVAL_ACCURACY, + value=eval_avg_mlm_accuracy.numpy(), + metadata={"epoch_num": epoch}, + sync=False, + ) + print( + { + "global_steps": global_step, + "eval_loss": eval_avg_loss, + "eval_mlm_accuracy": eval_avg_mlm_accuracy, + } + ) + + if args.target_mlm_accuracy: + if eval_avg_mlm_accuracy >= args.target_mlm_accuracy: + end_training, converged = True, True + if utils.is_main_process(): + print( + "%f > %f, Target MLM Accuracy reached at %d" + % ( + eval_avg_mlm_accuracy, + args.target_mlm_accuracy, + global_step, + ) + ) + + eval_count += 1 + next_eval_step = eval_steps[eval_count] + if args.target_mlm_accuracy and args.train_mlm_accuracy_window_size > 0: + accuracy_scores.append(mlm_acc) + if update_step: + accuracy_scores = accuracy_scores[ + -args.train_mlm_accuracy_window_size + * args.gradient_accumulation_steps : + ] + avg_mlm_accuracy[0] = sum(accuracy_scores) / len( + accuracy_scores + ) + torch.distributed.all_reduce( + avg_mlm_accuracy, op=torch.distributed.ReduceOp.SUM + ) + avg_mlm_accuracy /= args.world_size + + if ( + args.log_freq > 0 + and training_steps + % (args.log_freq * args.gradient_accumulation_steps) + == 0 + ): + samples_trained = ( + global_step + * args.train_batch_size + * args.gradient_accumulation_steps + * args.world_size + ) + if args.local_rank == 0 or args.local_rank == -1: + time_interval = time.time() - now_time + step_interval = global_step - now_step + now_time = time.time() + now_step = global_step + training_perf = ( + args.train_batch_size + * args.gradient_accumulation_steps + * args.world_size + * (step_interval + skip_interval) + / time_interval + ) + skip_interval = 0 + + if args.train_mlm_accuracy_window_size > 0: + print( + { + "training_steps": training_steps, + "average_loss": average_loss + / (args.log_freq * divisor), + "step_loss": loss.item() + * args.gradient_accumulation_steps + / divisor, + "learning_rate": now_lr, + "seq/s": training_perf, + "global_steps": now_step, + "samples_trained": samples_trained, + "skipped_steps": now_skipped, + "timestamp": now_time, + "mlm_accuracy": avg_mlm_accuracy[0].item(), + } + ) + else: + print( + { + "training_steps": training_steps, + "average_loss": average_loss + / (args.log_freq * divisor), + "step_loss": loss.item() + * args.gradient_accumulation_steps + / divisor, + "learning_rate": now_lr, + "seq/s": training_perf, + "global_steps": now_step, + "samples_trained": samples_trained, + "skipped_steps": now_skipped, + "timestamp": now_time, + } + ) + + # for DLFW CI/CD + mlperf_logger.log_event( + key="tracked_stats", + value={ + "seq/sec": training_perf, + "step_loss": loss.item() + * args.gradient_accumulation_steps + / divisor, + "avg_loss": average_loss / (args.log_freq * divisor), + "lr": now_lr, + }, + metadata={"step": (epoch, training_steps)}, + sync=False, + ) + + mlperf_logger.log_event(key="throughput", value=training_perf) + + average_loss = 0 + + if global_step >= args.max_steps or end_training: + status = "success" if converged else "aborted" + end_training = True + train_time_raw = time.time() - raw_train_start + average_loss = torch.tensor(average_loss, dtype=torch.float32) + if args.log_freq > 
0: + last_num_steps = ( + int(training_steps / args.gradient_accumulation_steps) + % args.log_freq + ) + last_num_steps = ( + args.log_freq if last_num_steps == 0 else last_num_steps + ) + average_loss = average_loss / (last_num_steps * divisor) + if torch.distributed.is_initialized(): + average_loss /= args.world_size + torch.distributed.all_reduce(average_loss) + final_loss = average_loss.item() + if utils.is_main_process(): + if args.train_mlm_accuracy_window_size > 0: + print( + ( + epoch, + training_steps / args.gradient_accumulation_steps, + ), + { + "final_loss": final_loss, + "final_mlm_accuracy": avg_mlm_accuracy[0].item(), + }, + ) + else: + print( + ( + epoch, + training_steps / args.gradient_accumulation_steps, + ), + {"final_loss": final_loss}, + ) + + if end_training or ( + samples_trained - samples_trained_prev + >= args.num_samples_per_checkpoint + and samples_trained >= args.min_samples_to_start_checkpoints + ): + samples_trained_prev = samples_trained + if utils.is_main_process() and not args.skip_checkpoint: + # Save a trained model + model_to_save = ( + model.module if hasattr(model, "module") else model + ) # Only save the model it-self + if args.phase2: + output_save_file = os.path.join( + args.output_dir, + "phase2_ckpt_{}.pt".format(samples_trained), + ) + else: + output_save_file = os.path.join( + args.output_dir, + "phase1_ckpt_{}.pt".format(samples_trained), + ) + if args.do_train: + torch.save( + { + "model": model_to_save.state_dict(), + "optimizer": optimizer.state_dict(), + "files": [f_id] + files, + }, + output_save_file, + ) + + most_recent_ckpts_paths.append(output_save_file) + if ( + len(most_recent_ckpts_paths) + > args.keep_n_most_recent_checkpoints + ): + ckpt_to_be_removed = most_recent_ckpts_paths.pop(0) + os.remove(ckpt_to_be_removed) + + if samples_trained >= args.max_samples_termination or end_training: + status = "success" if converged else "aborted" + end_training = True + break + t0 = get_time() + + del train_dataloader + + if samples_trained >= args.max_samples_termination or end_training: + status = "success" if converged else "aborted" + end_training = True + break + + if not need_next_training_shard: + # dataset_future = pool.submit(create_pretraining_dataset, data_file, args.max_predictions_per_seq, shared_file_list, args, worker_init_fn=worker_init) + pass + # train_dataloader, data_file = dataset_future.result(timeout=None) + train_dataloader, data_file = create_pretraining_dataset( + data_file, + args.max_predictions_per_seq, + shared_file_list, + args, + worker_init_fn=worker_init, + ) + mlperf_logger.log_end( + key=mlperf_logger.constants.BLOCK_STOP, + metadata={"first_epoch_num": epoch}, + sync=False, + ) + mlperf_logger.log_end( + key=mlperf_logger.constants.EPOCH_STOP, + metadata={"epoch_num": epoch}, + sync=False, + ) + epoch += 1 + + mlperf_logger.log_event( + key=mlperf_logger.constants.TRAIN_SAMPLES, value=samples_trained, sync=False + ) + mlperf_logger.log_event( + key=mlperf_logger.constants.EVAL_SAMPLES, + value=args.num_eval_examples, + sync=False, + ) + mlperf_logger.log_end( + key=mlperf_logger.constants.RUN_STOP, metadata={"status": status}, sync=False + ) + return args, final_loss, train_time_raw + + +if __name__ == "__main__": + main() diff --git a/models/language_modeling/pytorch/bert_large/training/gpu/requirements.txt b/models/language_modeling/pytorch/bert_large/training/gpu/requirements.txt new file mode 100644 index 000000000..c9ef171f4 --- /dev/null +++ 
b/models/language_modeling/pytorch/bert_large/training/gpu/requirements.txt @@ -0,0 +1,25 @@ +# Python dependencies required for development +astunparse +batchgenerators>=0.20.0 +expecttest +future +medpy +numpy +pandas +psutil +pyyaml +requests +SimpleITK +scikit-learn +scipy==1.10.0 +setuptools +six +types-dataclasses +typing_extensions +dataclasses; python_version<"3.7" +tensorflow-cpu +transformers +tensorboard +tensorboardX +absl-py +gdown diff --git a/models/language_modeling/pytorch/bert_large/training/gpu/run_pretrain_mlperf.py b/models/language_modeling/pytorch/bert_large/training/gpu/run_pretrain_mlperf.py new file mode 100644 index 000000000..87319c9c9 --- /dev/null +++ b/models/language_modeling/pytorch/bert_large/training/gpu/run_pretrain_mlperf.py @@ -0,0 +1,1095 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright (c) 2022-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Fine-tuning the library models for masked language modeling (BERT, ALBERT, RoBERTa...) +on a text file or a dataset without using HuggingFace Trainer. + +Here is the full list of checkpoints on the hub that can be fine-tuned by this script: +https://huggingface.co/models?filter=masked-lm +""" +# You can also adapt this script on your own mlm task. Pointers for this are left as comments. 
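For orientation: the pretraining script added here consumes pre-tokenized HDF5 shards whose file names contain "part" (see the pretraining_dataset class and the --input_dir/--eval_dir arguments below; the demo script points at data/hdf5_seq_512). A minimal sketch of that on-disk layout follows, with the sample count, shapes, and integer dtypes assumed only for illustration:

    # Editorial sketch, not part of the patch: writing one tiny shard in the layout
    # that pretraining_dataset expects. The loader casts every field to int64, so the
    # exact integer dtype on disk is not critical.
    import h5py
    import numpy as np

    n, seq_len, max_pred = 4, 512, 76
    with h5py.File("part_00000_of_00001.hdf5", "w") as f:
        f.create_dataset("input_ids", data=np.zeros((n, seq_len), dtype=np.int32))
        f.create_dataset("input_mask", data=np.ones((n, seq_len), dtype=np.int32))
        f.create_dataset("segment_ids", data=np.zeros((n, seq_len), dtype=np.int32))
        f.create_dataset("masked_lm_positions", data=np.zeros((n, max_pred), dtype=np.int32))
        f.create_dataset("masked_lm_ids", data=np.zeros((n, max_pred), dtype=np.int32))
        f.create_dataset("next_sentence_labels", data=np.zeros((n,), dtype=np.int32))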
+ +"""BERT Pretraining""" + +import argparse +import csv +import h5py +import os +import glob +import numpy as np +import torch +from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, Dataset +from torch.utils.data.distributed import DistributedSampler +import logging +import math +import multiprocessing +import random +import re +import time +import sys + +from collections import OrderedDict +from concurrent.futures import ProcessPoolExecutor + +from schedulers import LinearWarmupPolyDecayScheduler + +import utils + +import torch.nn.functional as F +import argparse + +import transformers +from transformers import ( + CONFIG_MAPPING, + MODEL_MAPPING, + AutoConfig, + AutoModelForPreTraining, + SchedulerType, + get_scheduler, + set_seed, +) + +from schedulers import LinearWarmUpScheduler, LinearWarmupPolyDecayScheduler +from lamb import Lamb + +try: + import intel_extension_for_pytorch as ipex +except: + ipex = None + +ref_time = 0 + + +def get_time(): + global ref_time + t = time.time() + return (t - ref_time) * 1000.0 + + +def sync_clock(): + global ref_time + t = time.time() + if torch.distributed.is_initialized(): + torch.distributed.barrier() + t = time.time() + t = torch.tensor([t], dtype=torch.double) + torch.distributed.broadcast(t, 0) + torch.distributed.barrier() + t = t.item() + ref_time = t + + +def synchronize(device='cpu'): + if device == 'cuda': + if torch.distributed.is_available() and torch.distributed.is_initialized(): + torch.distributed.all_reduce(torch.cuda.FloatTensor(1)) + torch.cuda.synchronize() + else: + torch.cuda.synchronize() + elif device=='xpu': + torch.xpu.synchronize() + + +def get_time_sync(device='cpu'): + synchronize(device) + return get_time() + + +logger = logging.getLogger(__name__) +MODEL_CONFIG_CLASSES = list(MODEL_MAPPING.keys()) +MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES) + + +class WorkerInitObj(object): + def __init__(self, seed): + self.seed = seed + + def __call__(self, id): + np.random.seed(seed=self.seed + id) + random.seed(self.seed + id) + + +def get_eval_batchsize_per_worker(args): + if torch.distributed.is_initialized(): + chunk_size = args.num_eval_examples // args.world_size + rank = args.local_rank + remainder = args.num_eval_examples % args.world_size + if rank < remainder: + return chunk_size + 1 + else: + return chunk_size + + +class pretraining_dataset(Dataset): + def __init__(self, input_file, max_pred_length): + self.input_file = input_file + self.max_pred_length = max_pred_length + f = h5py.File(input_file, "r") + keys = [ + "input_ids", + "input_mask", + "segment_ids", + "masked_lm_positions", + "masked_lm_ids", + "next_sentence_labels", + ] + self.inputs = [np.asarray(f[key][:]) for key in keys] + print(f"Loaded {len(self.inputs[0]):d} samples from datafile: {input_file}") + f.close() + + def __len__(self): + "Denotes the total number of samples" + return len(self.inputs[0]) + + def __getitem__(self, index): + [ + input_ids, + input_mask, + segment_ids, + masked_lm_positions, + masked_lm_ids, + next_sentence_labels, + ] = [ + torch.from_numpy(input[index].astype(np.int64)) + if indice < 5 + else torch.from_numpy(np.asarray(input[index].astype(np.int64))) + for indice, input in enumerate(self.inputs) + ] + masked_lm_labels = torch.zeros(input_ids.shape, dtype=torch.long) - 100 + index = self.max_pred_length + masked_token_count = torch.count_nonzero(masked_lm_positions) + if masked_token_count != 0: + index = masked_token_count + masked_lm_labels[masked_lm_positions[:index]] = 
masked_lm_ids[:index] + + return [ + input_ids, + segment_ids, + input_mask, + masked_lm_labels, + next_sentence_labels, + ] + + +def create_pretraining_dataset(input_file, max_pred_length, shared_list, args, worker_init_fn): + train_data = pretraining_dataset( + input_file=input_file, max_pred_length=max_pred_length + ) + train_sampler = RandomSampler(train_data) + train_dataloader = DataLoader( + train_data, sampler=train_sampler, batch_size=args.train_batch_size + ) + return train_dataloader, input_file + + +def create_eval_dataset(args, worker_init_fn): + eval_data = [] + for eval_file in sorted(os.listdir(args.eval_dir)): + eval_file_path = os.path.join(args.eval_dir, eval_file) + + if os.path.isfile(eval_file_path) and "part" in eval_file_path: + eval_data.extend( + pretraining_dataset( + eval_file_path, max_pred_length=args.max_predictions_per_seq + ) + ) + if len(eval_data) > args.num_eval_examples: + eval_data = eval_data[: args.num_eval_examples] + break + if torch.distributed.is_initialized(): + chunk_size = args.num_eval_examples // args.world_size + rank = args.local_rank + remainder = args.num_eval_examples % args.world_size + if rank < remainder: + eval_data = eval_data[ + (chunk_size + 1) * rank : (chunk_size + 1) * (rank + 1) + ] + else: + eval_data = eval_data[ + chunk_size * rank + remainder : chunk_size * (rank + 1) + remainder + ] + + eval_sampler = SequentialSampler(eval_data) + eval_dataloader = DataLoader( + eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size, num_workers=0 + ) + return eval_dataloader + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Finetune a transformers model on a Masked Language Modeling task" + ) + + ## Required parameters + parser.add_argument("--input_dir", + default=None, + type=str, + required=True, + help="The input data dir. Should contain .hdf5 files for the task.") + parser.add_argument("--output_dir", + default=None, + type=str, + required=True, + help="The output directory where the model checkpoints will be written.") + parser.add_argument("--eval_dir", + default=None, + type=str, + help="The eval data dir. Should contain .hdf5 files for the task.") + parser.add_argument("--eval_iter_start_samples", + default=3000000, + type=int, + help="Sample to begin performing eval.") + parser.add_argument("--eval_iter_samples", + default=16, + type=int, + help="If set to -1, disable eval, \ + else evaluate every eval_iter_samples during training") + parser.add_argument("--num_eval_examples", + default=10000, + type=int, + help="number of eval examples to run eval on") + parser.add_argument("--init_checkpoint", + default=None, + type=str, + help="The initial checkpoint to start training from.") + parser.add_argument("--init_tf_checkpoint", + default=None, + type=str, + help="The initial TF checkpoint to start training from.") + parser.add_argument( + "--train_file", type=str, default=None, help="A csv or a json file containing the training data." + ) + parser.add_argument( + "--validation_file", type=str, default=None, help="A csv or a json file containing the validation data." 
+ ) + parser.add_argument( + "--model_name_or_path", + type=str, + help="Path to pretrained model or model identifier from huggingface.co/models.", + ) + parser.add_argument( + "--config_name", + type=str, + default=None, + help="Pretrained config name or path if not the same as model_name", + ) + parser.add_argument( + "--per_device_eval_batch_size", + type=int, + default=8, + help="Batch size (per device) for the evaluation dataloader.", + ) + parser.add_argument( + "--learning_rate", + type=float, + default=5e-5, + help="Initial learning rate (after the potential warmup period) to use.", + ) + parser.add_argument("--max_predictions_per_seq", + default=76, + type=int, + help="The maximum total of masked tokens in input sequence") + parser.add_argument("--train_batch_size", + default=8, + type=int, + help="Total batch size for training.") + parser.add_argument("--eval_batch_size", + default=16, + type=int, + help="Total batch size for training.") + parser.add_argument("--weight_decay_rate", + default=0.01, + type=float, + help="weight decay rate for LAMB.") + parser.add_argument("--opt_lamb_beta_1", + default=0.9, + type=float, + help="LAMB beta1.") + parser.add_argument("--opt_lamb_beta_2", + default=0.999, + type=float, + help="LAMB beta2.") + parser.add_argument("--max_steps", + default=1536, + type=float, + help="Total number of training steps to perform.") + parser.add_argument("--max_samples_termination", + default=14000000, + type=float, + help="Total number of training samples to run.") + parser.add_argument("--warmup_proportion", + default=0.01, + type=float, + help="Proportion of optimizer update steps to perform linear learning rate warmup for. " + "Typically 1/8th of steps for Phase2") + parser.add_argument("--warmup_steps", + default=0, + type=float, + help="Number of optimizer update steps to perform linear learning rate warmup for. " + "Typically 1/8th of steps for Phase2") + parser.add_argument("--start_warmup_step", + default=0, + type=float, + help="Starting step for warmup. ") + parser.add_argument('--log_freq', + type=float, + default=10000.0, + help='frequency of logging loss. If not positive, no logging is provided for training loss') + parser.add_argument('--checkpoint_activations', + default=False, + action='store_true', + help="Whether to use gradient checkpointing") + parser.add_argument("--resume_from_checkpoint", + default=False, + action='store_true', + help="Whether to resume training from checkpoint. 
If set, precedes init_checkpoint/init_tf_checkpoint") + parser.add_argument('--keep_n_most_recent_checkpoints', + type=int, + default=20, + help="Number of checkpoints to keep (rolling basis).") + parser.add_argument('--num_samples_per_checkpoint', + type=int, + default=500000, + help="Number of update steps until a model checkpoint is saved to disk.") + parser.add_argument('--min_samples_to_start_checkpoints', + type=int, + default=3000000, + help="Number of update steps until model checkpoints start saving to disk.") + parser.add_argument('--skip_checkpoint', + default=False, + action='store_true', + help="Whether to save checkpoints") + parser.add_argument('--phase2', + default=False, + action='store_true', + help="Only required for checkpoint saving format") + parser.add_argument("--do_train", + default=False, + action='store_true', + help="Whether to run training.") + parser.add_argument('--bert_config_path', + type=str, + default="/workspace/phase1", + help="Path bert_config.json is located in") + parser.add_argument('--target_mlm_accuracy', + type=float, + default=0.72, + help="Stop training after reaching this Masked-LM accuracy") + parser.add_argument('--train_mlm_accuracy_window_size', + type=int, + default=0, + help="Average accuracy over this amount of batches before performing a stopping criterion test") + parser.add_argument('--num_epochs_to_generate_seeds_for', + type=int, + default=2, + help="Number of epochs to plan seeds for. Same set across all workers.") + parser.add_argument("--use_gradient_as_bucket_view", + default=False, + action='store_true', + help="Turn ON gradient_as_bucket_view optimization in native DDP.") + parser.add_argument("--dense_seq_output", + default=False, + action='store_true', + help="Whether to run with optimizations.") + parser.add_argument( + "--no_ddp", default=False, action="store_true", help="Whether to use DDP." 
+ ) + parser.add_argument( + "--dist_lamb", + default=False, + action="store_true", + help="Whether to use DistLamb from pcl_bert", + ) + + # For dtype specific training + parser.add_argument("--bf16", + default=False, + action='store_true', + help="Enale BFloat16 training") + parser.add_argument("--bf32", + default=False, + action='store_true', + help="Enale BFloat32 training") + + parser.add_argument("--weight_decay", type=float, default=0.0, help="Weight decay to use.") + parser.add_argument("--num_train_epochs", type=int, default=3, help="Total number of training epochs to perform.") + parser.add_argument( + "--gradient_accumulation_steps", + type=int, + default=1, + help="Number of updates steps to accumulate before performing a backward/update pass.", + ) + parser.add_argument( + "--lr_scheduler_type", + type=SchedulerType, + default="linear", + help="The scheduler type to use.", + choices=["linear", "cosine", "cosine_with_restarts", "polynomial", "constant", "constant_with_warmup"], + ) + parser.add_argument("--seed", type=int, default=42, help="A seed for reproducible training.") + parser.add_argument( + "--model_type", + type=str, + default=None, + help="Model type to use if training from scratch.", + choices=MODEL_TYPES, + ) + + parser.add_argument( + "--preprocessing_num_workers", + type=int, + default=None, + help="The number of processes to use for the preprocessing.", + ) + parser.add_argument("--local_rank", + default=0, + type=int, + help="Total batch size for training.") + parser.add_argument("--world_size", + default=1, + type=int, + help="Total batch size for training.") + + parser.add_argument("--profile", action="store_true", help="Whether to enable profiling") + parser.add_argument("--export_chrome_trace", action="store_true", help="Exports the collected trace in Chrome JSON format.") + parser.add_argument( + "--skip_exchange_padding", + default=False, + action="store_true", + help="Whether to skip padding exchange.", + ) + parser.add_argument( + "--unpad", + default=False, + action="store_true", + help="Whether to run with unpadding.", + ) + parser.add_argument( + "--benchmark_steps", + type=int, + default=0, + help="Number of steps to run for benchmark.", + ) + parser.add_argument( + "--dist_backend", + type=str, + default="ccl", + help="Specify distributed backend to use.", + ) + parser.add_argument('--dist-url', default='127.0.0.1', type=str, + help='url used to set up distributed training') + parser.add_argument('--dist-port', default='29500', type=str, + help='url port used to set up distributed training') + parser.add_argument( + "--multi_instance", + action="store_true", + help="Skip torch.distributed initialization to Run multiple instance independently", + ) + parser.add_argument( + "--dist_profile", + action="store_true", + help="Whether to enable distributed timing profile", + ) + + # choose device + parser.add_argument( + "--device", + type=str, + default="cpu", + help="backend to run", + ) + parser.add_argument( + "--amp", + action="store_true", + help="Whether to enable autocast", + ) + parser.add_argument("--lamb", action="store_true") + parser.add_argument("--adamw", action="store_true") + parser.add_argument("--num-iterations", default='10000000000', type=str) + parser.add_argument("--info", action="store_true") + + args = parser.parse_args() + if args.output_dir is not None: + os.makedirs(args.output_dir, exist_ok=True) + return args + + +def found_resume_checkpoint(args): + if args.phase2: + checkpoint_str = "phase2_ckpt*.pt" + else: + 
checkpoint_str = "phase1_ckpt*.pt" + return args.resume_from_checkpoint and len(glob.glob(os.path.join(args.output_dir, checkpoint_str))) > 0 + + +def setup_training(args): + device = torch.device(args.device) + if int(os.environ.get("PMI_SIZE", "0")) > 1 and not args.multi_instance: + if args.dist_backend == "ccl": + try: + import oneccl_bindings_for_pytorch + except: + print("CCL backend requested but import oneccl_bindings_for_pytorch failed") + raise + elif args.dist_backend == "mpi": + if not torch.distributed.is_mpi_available(): + try: + import torch_mpi + except: + print( + "MPI backend requested but not available try installing torch_mpi module" + ) + raise + else: + raise ValueError(f"{args.dist_backend} backend requested but not supported") + + mpi_world_size = int(os.environ.get('PMI_SIZE', -1)) + if mpi_world_size > 0: + os.environ['MASTER_ADDR'] = args.dist_url #'127.0.0.1' + os.environ['MASTER_PORT'] = args.dist_port #'29500' + os.environ['RANK'] = os.environ.get('PMI_RANK', -1) + os.environ['WORLD_SIZE'] = os.environ.get('PMI_SIZE', -1) + args.local_rank = int(os.environ.get('PMI_RANK', -1)) + args.world_size = int(os.environ.get("WORLD_SIZE", -1)) + init_method = 'tcp://' + args.dist_url + ':' + args.dist_port + torch.distributed.init_process_group(backend=args.dist_backend, init_method=init_method, world_size=args.world_size, rank=args.local_rank) + # refer to https://www.open-mpi.org/faq/?category=running#mpi-environmental-variables for MPI ENV + _local_rank = os.environ['MPI_LOCALRANKID'] + if 'OMPI_COMM_WORLD_LOCAL_RANK' in os.environ.keys(): + _local_rank = os.environ['OMPI_COMM_WORLD_LOCAL_RANK'] + device = torch.device("xpu", int(_local_rank)) + print('world_size:{}, rank:{}, device:{}'.format(args.world_size, args.local_rank, device)) + + + args.local_rank = torch.distributed.get_rank() + args.world_size = torch.distributed.get_world_size() + if args.local_rank == 0: + print( + f"##################Using {args.dist_backend.upper()} dist run with {args.world_size} ranks", + flush=True, + ) + if args.gradient_accumulation_steps < 1: + raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format( + args.gradient_accumulation_steps)) + if args.train_batch_size % args.gradient_accumulation_steps != 0: + raise ValueError("Invalid gradient_accumulation_steps parameter: {}, batch size {} should be divisible".format( + args.gradient_accumulation_steps, args.train_batch_size)) + + args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps + + if not (args.do_train or (args.eval_dir and args.eval_iter_samples <= 0)): + raise ValueError(" `do_train` or should be in offline eval mode") + + if not args.resume_from_checkpoint or not os.path.exists(args.output_dir): + os.makedirs(args.output_dir, exist_ok=True) + + return device, args + + +optimize_dtype = None + +def prepare_model_and_optimizer(args, device): + global optimize_dtype + if args.bf16: + optimize_dtype = torch.bfloat16 + else: + optimize_dtype = torch.float32 + + global_step = 0 + args.resume_step = 0 + checkpoint = None + # download model & vocab. 
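The branch that follows builds the model through the Hugging Face auto classes (AutoConfig plus AutoModelForPreTraining). A self-contained sketch of that loading path; the checkpoint name is an assumption for illustration, not something the patch hard-codes:

    # Editorial sketch mirroring prepare_model_and_optimizer(); "bert-large-uncased"
    # is an assumed checkpoint name.
    from transformers import AutoConfig, AutoModelForPreTraining

    config = AutoConfig.from_pretrained("bert-large-uncased")
    model = AutoModelForPreTraining.from_pretrained("bert-large-uncased", config=config)
    print(type(model).__name__)  # a pretraining-head model with MLM and NSP outputs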
+ if args.config_name: + config = AutoConfig.from_pretrained(args.config_name) + elif args.model_name_or_path: + config = AutoConfig.from_pretrained(args.model_name_or_path) + else: + config = CONFIG_MAPPING[args.model_type]() + logger.warning("You are instantiating a new config instance from scratch.") + + config.dense_seq_output = args.dense_seq_output + if args.model_name_or_path: + model = AutoModelForPreTraining.from_pretrained( + args.model_name_or_path, + from_tf=bool(".ckpt" in args.model_name_or_path), + config=config, + ) + else: + logger.info("Training new model from scratch") + model = AutoModelForPreTraining.from_config(config) + + param_optimizer = list(model.named_parameters()) + + no_decay = ['bias', 'gamma', 'beta', 'LayerNorm'] + + optimizer_grouped_parameters = [ + {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay_rate}, + {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}] + + print( + "GroupSizes: ", + [sum([p.numel() for p in g["params"]]) for g in optimizer_grouped_parameters], + ) + + model = model.to(device) + if torch.distributed.is_initialized() and not args.no_ddp: + torch.xpu.set_device(device) + model.xpu(device) + if args.dist_lamb: + raise NotImplementedError('Not implemeted for dist_lamb') + elif args.adamw: + if args.device == 'xpu': + optimizer = torch.optim.AdamW( + optimizer_grouped_parameters, + lr=args.learning_rate) + model, optimizer = torch.xpu.optimize(model=model, optimizer=optimizer, dtype=optimize_dtype) + elif args.device == 'cuda': + optimizer = torch.optim.AdamW( + optimizer_grouped_parameters, + lr=args.learning_rate) + elif args.lamb: + optimizer = Lamb( + optimizer_grouped_parameters, + lr=args.learning_rate, + betas=(args.opt_lamb_beta_1, args.opt_lamb_beta_2)) + else: + raise NotImplementedError('Please use valid optimizer') + + if args.warmup_steps == 0: + warmup_steps = int(args.max_steps * args.warmup_proportion) + warmup_start = 0 + else: + warmup_steps = args.warmup_steps + warmup_start = args.start_warmup_step + + lr_scheduler = LinearWarmupPolyDecayScheduler( + optimizer, + start_warmup_steps=warmup_start, + warmup_steps=warmup_steps, + total_steps=args.max_steps, + end_learning_rate=0.0, + degree=1.0) + + if args.device == 'xpu': + pass + else: + if args.bf16 and not args.amp: + model = model.bfloat16() + + if torch.distributed.is_initialized() and not args.no_ddp: + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[device], find_unused_parameters=True) + return model, optimizer, lr_scheduler, checkpoint, global_step + + +def run_eval( + model, + eval_dataloader, + device, + num_eval_examples, + args, + first_eval=False, + use_cache=False, +): + model.eval() + total_eval_loss, total_eval_mlm_acc = 0.0, 0.0 + total_masked = 0 + with torch.no_grad(): + for batch in eval_dataloader: + ( + input_ids, + segment_ids, + input_mask, + masked_lm_labels, + next_sentence_labels, + ) = batch + input_ids = input_ids.to(device) + segment_ids = segment_ids.to(device) + input_mask = input_mask.to(device) + masked_lm_labels = masked_lm_labels.to(device) + next_sentence_labels = next_sentence_labels.to(device) + outputs = model( + input_ids=input_ids, + token_type_ids=segment_ids, + attention_mask=input_mask, + labels=masked_lm_labels, + next_sentence_label=next_sentence_labels, + ) + mlm_acc, num_masked = calc_mlm_acc( + outputs, masked_lm_labels, args.dense_seq_output + ) + total_eval_loss += outputs.loss.item() 
* num_masked + total_eval_mlm_acc += mlm_acc * num_masked + total_masked += num_masked + model.train() + total_masked = torch.tensor(total_masked, device=device, dtype=torch.int64) + total_eval_loss = torch.tensor(total_eval_loss, device=device, dtype=torch.float64) + if torch.distributed.is_initialized(): + # Collect total scores from all ranks + torch.distributed.all_reduce( + total_eval_mlm_acc, op=torch.distributed.ReduceOp.SUM + ) + torch.distributed.all_reduce(total_eval_loss, op=torch.distributed.ReduceOp.SUM) + torch.distributed.all_reduce(total_masked, op=torch.distributed.ReduceOp.SUM) + + # Average by number of examples + total_eval_mlm_acc /= total_masked + total_eval_loss /= total_masked + + return total_eval_loss, total_eval_mlm_acc + + +def global_batch_size(args): + return args.train_batch_size * args.gradient_accumulation_steps * args.world_size + + +def calc_mlm_acc(outputs, masked_lm_labels, dense_seq_output=False): + prediction_scores = outputs.prediction_logits + masked_lm_labels_flat = masked_lm_labels.view(-1) + mlm_labels = masked_lm_labels_flat[masked_lm_labels_flat != -100] + if not dense_seq_output: + prediction_scores_flat = prediction_scores.view(-1, prediction_scores.shape[-1]) + mlm_predictions_scores = prediction_scores_flat[masked_lm_labels_flat != -100] + mlm_predictions = mlm_predictions_scores.argmax(dim=-1) + else: + mlm_predictions = prediction_scores.argmax(dim=-1) + + num_masked = mlm_labels.numel() + mlm_acc = (mlm_predictions == mlm_labels).sum(dtype=torch.float) / num_masked + + return mlm_acc, num_masked + + +def calc_accuracy(outputs, masked_lm_labels, next_sentence_label, args): + loss = outputs.loss.item() + prediction_logits = outputs.prediction_logits + seq_relationship_logits = outputs.seq_relationship_logits + mlm_acc, num_masked = calc_mlm_acc(outputs, masked_lm_labels, args.dense_seq_output) + seq_acc_t = torch.argmax(seq_relationship_logits, dim=-1).eq(next_sentence_label.view([-1])).to(torch.float) + seq_acc_true, seq_tot = seq_acc_t.sum().item(), seq_acc_t.numel() + seq_acc = seq_acc_true / seq_tot + return loss, mlm_acc, num_masked, seq_acc, seq_tot + + +def main(): + latency_list = [] + args = parse_args() + status = 'aborted' # later set to 'success' if termination criteria met + device, args = setup_training(args) + print("####################args.world_size={}, args.local_rank={}".format(args.world_size, args.local_rank)) + total_batch_size = global_batch_size(args) + # Initialize the accelerator. We will let the accelerator handle device placement for us in this example. + # Make one log on every process with the configuration for debugging. 
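calc_mlm_acc() above scores only the positions whose label is not -100, the ignore value that pretraining_dataset writes for unmasked tokens. A toy check of that rule, with shapes and values assumed:

    # Editorial sketch of the masked-LM accuracy rule used by calc_mlm_acc().
    import torch

    logits = torch.tensor([[[0.1, 2.0], [3.0, 0.2], [0.0, 1.0]]])  # (batch=1, seq=3, vocab=2)
    labels = torch.tensor([[1, -100, 0]])                          # -100 = position not masked

    flat_labels = labels.view(-1)
    keep = flat_labels != -100                                     # only masked positions count
    predictions = logits.view(-1, logits.size(-1))[keep].argmax(dim=-1)
    mlm_acc = (predictions == flat_labels[keep]).sum(dtype=torch.float) / keep.sum()
    print(mlm_acc.item())  # 0.5: one of the two masked positions is predicted correctly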
+ logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, + ) + if args.local_rank == 0 or args.local_rank == -1: + print("parsed args:") + print(args) + # Prepare optimizer + model, optimizer, lr_scheduler, checkpoint, global_step = prepare_model_and_optimizer(args, device) + worker_seeds, shuffling_seeds = utils.setup_seeds(args.seed, args.num_epochs_to_generate_seeds_for, device) + worker_seed = worker_seeds[args.local_rank] + random.seed(worker_seed) + np.random.seed(worker_seed) + torch.manual_seed(worker_seed) + worker_init = WorkerInitObj(worker_seed) + + samples_trained = global_step * args.train_batch_size * args.gradient_accumulation_steps * args.world_size + final_loss = float("inf") + train_time_raw = float("inf") + raw_train_start = time.time() + + if args.do_train: + model.train() + most_recent_ckpts_paths = [] + average_loss = 0.0 # averaged loss every args.log_freq steps + epoch = 1 + training_steps = 0 + end_training, converged = False, False + samples_trained_prev = 0 + + # pre-compute eval boundaries + samples_trained_per_step = args.train_batch_size * args.gradient_accumulation_steps * args.world_size + start, stop, step = args.eval_iter_start_samples, args.max_samples_termination, args.eval_iter_samples + eval_steps = [math.ceil(i/samples_trained_per_step) for i in np.arange(start, stop, step)] + eval_count = 0 + next_eval_step = eval_steps[eval_count] + + if args.target_mlm_accuracy: + if args.train_mlm_accuracy_window_size > 0: + accuracy_scores = [] + avg_mlm_accuracy = torch.Tensor([0]) + + first_epoch = True + if found_resume_checkpoint(args): + # dead path + f_start_id = checkpoint['files'][0] + files = checkpoint['files'][1:] + num_files = len(files) + else: + files = [os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir) if + os.path.isfile(os.path.join(args.input_dir, f)) and 'part' in f] + files.sort() + num_files = len(files) + random.Random(shuffling_seeds[epoch%len(shuffling_seeds)]).shuffle(files) + f_start_id = 0 + + # if torch.distributed.is_initialized() and not args.no_ddp: + # model = torch.nn.parallel.DistributedDataParallel(model, + # # find_unused_parameters=True, + # # bucket_cap_mb=8192, + # gradient_as_bucket_view=args.use_gradient_as_bucket_view) + + sync_clock() + + while global_step < args.max_steps and not end_training: + if args.local_rank == 0 or args.local_rank == -1: + now_time = time.time() + print("epoch:", epoch) + + # Reshuffle file list on subsequent epochs + if not first_epoch: + files = [os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir) if + os.path.isfile(os.path.join(args.input_dir, f)) and 'part' in f] + files.sort() + num_files = len(files) + random.Random(shuffling_seeds[epoch%len(shuffling_seeds)]).shuffle(files) + f_start_id = 0 + first_epoch = False + + shared_file_list = {} + + if torch.distributed.is_initialized() and args.world_size > num_files: + remainder = args.world_size % num_files + data_file = files[(f_start_id*args.world_size + args.local_rank + + remainder * f_start_id) % num_files] + else: + data_file = files[(f_start_id*args.world_size + args.local_rank) % num_files] + + train_data = pretraining_dataset(data_file, args.max_predictions_per_seq) + if torch.distributed.is_initialized() and not args.no_ddp: + train_sampler = torch.utils.data.distributed.DistributedSampler(train_data) + else: + train_sampler = RandomSampler(train_data) + train_dataloader = DataLoader(train_data, 
sampler=train_sampler, batch_size=args.train_batch_size) + + for f_id in range(f_start_id, len(files)): + if args.world_size > num_files: + data_file = files[(f_id*args.world_size + args.local_rank + + remainder * f_id) % num_files] + else: + data_file = files[(f_id*args.world_size + args.local_rank)%num_files] + + for step, batch in enumerate(train_dataloader): + if training_steps >= 10 + int(args.num_iterations): + latency_list = latency_list[10:] + avg = sum(latency_list) / len(latency_list) + print('bert_train latency: ' + str(avg) + ' s') + print('bert_train throughput: ' + str(total_batch_size / args.world_size / avg) + ' sentences/s') + print('perplexity = ' + str(gloss)) + return + training_steps += 1 + + start_h2d = get_time_sync(args.device) + + input_ids, segment_ids, input_mask, masked_lm_labels, next_sentence_labels = batch + input_ids = input_ids.to(device) + segment_ids = segment_ids.to(device) + input_mask = input_mask.to(device) + masked_lm_labels = masked_lm_labels.to(device) + next_sentence_labels = next_sentence_labels.to(device) + + end_h2d = get_time_sync(args.device) + + info = { + "input_ids": [input_ids.shape, input_ids.dtype], + "segment_ids": [segment_ids.shape, segment_ids.dtype], + "input_mask": [input_mask.shape, input_mask.dtype], + "masked_lm_labels": [masked_lm_labels.shape, masked_lm_labels.dtype], + "next_sentence_labels": [next_sentence_labels.shape, next_sentence_labels.dtype] + } + if args.info: + print("datainfo", info) + + def step_training(): + outputs = None + if args.bf16 and args.amp: + if args.device == 'cpu': + with torch.cpu.amp.autocast(): + outputs = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, + labels=masked_lm_labels, next_sentence_label=next_sentence_labels) + elif args.device == 'xpu': + with torch.xpu.amp.autocast(enabled=True, dtype=optimize_dtype): + outputs = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, + labels=masked_lm_labels, next_sentence_label=next_sentence_labels) + elif args.device == 'cuda': + with torch.cuda.amp.autocast(): + outputs = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, + labels=masked_lm_labels, next_sentence_label=next_sentence_labels) + else: + outputs = model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, + labels=masked_lm_labels, next_sentence_label=next_sentence_labels) + loss = outputs.loss + loss = loss / args.gradient_accumulation_steps + loss.backward() + if hasattr(optimizer, "clip_grad_norm_"): + ggnorm = optimizer.clip_grad_norm_(1.0) + else: + ggnorm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) + if (step + 1) % args.gradient_accumulation_steps == 0 or step == len( + train_dataloader + ) - 1: + start_opt = get_time_sync(args.device) + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + end_opt = get_time_sync(args.device) + return outputs, end_opt - start_opt + + start_training = get_time_sync(args.device) + if args.device == 'xpu': + with torch.autograd.profiler_legacy.profile(args.profile, use_xpu=True) as prof: + outputs, opt_time = step_training() + if args.profile: + print(str(prof.key_averages().table(sort_by="self_xpu_time_total"))) + if args.export_chrome_trace and step > 20: + with open('./profile_trace.txt', 'w') as f: + f.write(str(prof.table(sort_by="id", row_limit=100000))) + prof.export_chrome_trace('./profile_trace.json') + raise + elif args.device == 'cuda': + if args.profile: + with torch.profiler.profile( + 
activities=[ + # torch.profiler.ProfilerActivity.CPU, + torch.profiler.ProfilerActivity.CUDA, + ] + ) as prof: + outputs, opt_time = step_training() + else: + outputs, opt_time = step_training() + if args.profile: + print(str(prof.key_averages().table(sort_by="self_cuda_time_total"))) + if args.export_chrome_trace and step > 20: + prof.export_chrome_trace('./profile_trace.json') + raise + end_training = get_time_sync(args.device) + + timeinfo = { + 'h2d': end_h2d - start_h2d, + 'training': end_training - start_training, + 'opt': opt_time, # training include opt_time + } + timeinfo['total'] = timeinfo['h2d'] + timeinfo['training'] + latency_list.append(timeinfo['total']/1000.0) + if args.info: + print('timeinfo', timeinfo) + if training_steps > 10: + print('train perf: ' + str(timeinfo['total']/1000.0) + ' s/it') + + gloss, lm_acc, num_masked, seq_acc, seq_tot = calc_accuracy( + outputs, masked_lm_labels, next_sentence_labels, args + ) + + if args.benchmark_steps > 0 and global_step + 1 >= args.benchmark_steps: + synchronize(args.device) + if args.local_rank == 0: + print(f"Done Benchmarking {args.benchmark_steps} steps.") + sys.exit(0) + + info = { + "gloss": gloss, + "lm_acc": lm_acc, + "num_masked": num_masked, + "seq_acc": seq_acc, + "seq_tot": seq_tot + } + if args.info: + print("outinfo", info) + + update_step = training_steps % args.gradient_accumulation_steps == 0 + if update_step: + global_step += 1 + if ( + args.eval_dir + and args.eval_iter_samples > 0 + and global_step == next_eval_step + ): + if eval_count == 0: + eval_dataloader = create_eval_dataset( + args, worker_init_fn=worker_init + ) # eval_dataset_future.result(timeout=None) + samples_trained = ( + global_step + * args.train_batch_size + * args.gradient_accumulation_steps + * args.world_size + ) + samples_trained_prev = samples_trained + eval_avg_loss, eval_avg_mlm_accuracy = run_eval( + model, + eval_dataloader, + device, + args.num_eval_examples, + args, + first_eval=(eval_count == 0), + ) + if args.local_rank == 0 or args.local_rank == -1: + print( + { + "global_steps": global_step, + "eval_loss": eval_avg_loss, + "eval_mlm_accuracy": eval_avg_mlm_accuracy, + } + ) + eval_count += 1 + next_eval_step = eval_steps[eval_count] + + del train_dataloader + if samples_trained >= args.max_samples_termination or end_training: + status = 'success' if converged else 'aborted' + end_training = True + break + train_dataloader, data_file = create_pretraining_dataset( + data_file, + args.max_predictions_per_seq, + shared_file_list, + args, + worker_init_fn=worker_init, + ) + epoch += 1 + return args, final_loss, train_time_raw + + +if __name__ == "__main__": + main() diff --git a/models/language_modeling/pytorch/bert_large/training/gpu/run_pretraining_demo.sh b/models/language_modeling/pytorch/bert_large/training/gpu/run_pretraining_demo.sh new file mode 100644 index 000000000..d4ee41f4a --- /dev/null +++ b/models/language_modeling/pytorch/bert_large/training/gpu/run_pretraining_demo.sh @@ -0,0 +1,44 @@ +# +# Copyright (c) 2022 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +python run_pretrain_mlperf.py \ + --config_name=bert_config.json \ + --input_dir=data/hdf5_seq_512 \ + --output_dir=result \ + --eval_dir=data/hdf5_seq_512 \ + --device=xpu \ + --do_train \ + --train_batch_size=16 \ + --gradient_accumulation_steps=1 \ + --bf16 \ + --adamw --num-iterations 10 + +# DDP training +source $(python -c "import oneccl_bindings_for_pytorch as torch_ccl;print(torch_ccl.cwd)")/env/setvars.sh +export LD_PRELOAD=$(python -c "import oneccl_bindings_for_pytorch as torch_ccl;print(torch_ccl.cwd)")/lib/libmpi.so +export ONECCL_BINDINGS_FOR_PYTORCH_ENV_VERBOSE=1 + +mpiexec -n 2 -l python -u run_pretrain_mlperf.py \ + --config_name=bert_config.json \ + --input_dir=data/hdf5_seq_512 \ + --output_dir=result \ + --eval_dir=data/hdf5_seq_512 \ + --device=xpu \ + --do_train \ + --train_batch_size=16 \ + --gradient_accumulation_steps=1 \ + --bf16 \ + --adamw --num-iterations 10 diff --git a/models/language_modeling/pytorch/bert_large/training/gpu/schedulers.py b/models/language_modeling/pytorch/bert_large/training/gpu/schedulers.py new file mode 100644 index 000000000..50c5679e9 --- /dev/null +++ b/models/language_modeling/pytorch/bert_large/training/gpu/schedulers.py @@ -0,0 +1,119 @@ +# Copyright (c) 2022-2023 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. +# Copyright 2018 The Google AI Language Team Authors and The HugginFace Inc. team. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
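The schedulers implemented next pair a linear warmup with a polynomial decay toward end_learning_rate. A standalone sketch of the resulting learning-rate shape, using the pretraining script's defaults (learning_rate 5e-5, max_steps 1536, warmup_proportion 0.01, i.e. about 15 warmup steps) and simplifying away the start_warmup_steps offset:

    # Editorial sketch of the warmup + poly-decay shape of LinearWarmupPolyDecayScheduler.
    def lr_at(step, base_lr=5e-5, warmup_steps=15, total_steps=1536, end_lr=0.0, degree=1.0):
        if step < warmup_steps:
            return base_lr * step / warmup_steps                             # linear warmup
        progress = min(step / total_steps, 1.0)
        return (base_lr - end_lr) * (1.0 - progress) ** degree + end_lr      # polynomial decay

    for s in (0, 5, 15, 768, 1536):
        print(s, lr_at(s))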
+ +import math +import torch +from torch.optim.optimizer import Optimizer +from torch.optim.lr_scheduler import _LRScheduler + + +class LRScheduler(_LRScheduler): + def __init__(self, optimizer, last_epoch=-1): + # Check if using mixed precision training + self.mixed_training = False + base_optimizer = optimizer + + # Check that optimizer param is valid + if not isinstance(optimizer, Optimizer): + raise TypeError('{} is not an Optimizer'.format( + type(optimizer).__name__)) + + super(LRScheduler, self).__init__(base_optimizer, last_epoch) + + def step(self, epoch=None): + # Set the current training step + # ('epoch' is used to be consistent with _LRScheduler) + if self.mixed_training: + # The assumption is that the step will be constant + state_dict = self.optimizer.state[self.optimizer.param_groups[0]['params'][0]] + if 'step' in state_dict: + self.last_epoch = state_dict['step'] + 1 + else: + self.last_epoch = 1 + else: + self.last_epoch = epoch if epoch is not None else self.last_epoch + 1 + + for param_group, lr in zip(self.optimizer.param_groups, self.get_lr()): + param_group['lr'] = lr + + +class LinearWarmUpScheduler(LRScheduler): + """ + Applies a warm up period to the learning rate. + """ + + def __init__(self, optimizer, warmup, total_steps, last_epoch=-1): + self.warmup = warmup + self.total_steps = total_steps + super(LinearWarmUpScheduler, self).__init__(optimizer, last_epoch) + + def get_lr(self): + progress = self.last_epoch / self.total_steps + if progress < self.warmup: + return [base_lr * progress / self.warmup for base_lr in self.base_lrs] + else: + return [base_lr * max((progress - 1.0)/(self.warmup - 1.0), 0.) for base_lr in self.base_lrs] + + +class LinearWarmupPolyDecayScheduler(LRScheduler): + """ + Applies a warm up period to the learning rate. + """ + + def __init__(self, optimizer, start_warmup_steps, warmup_steps, total_steps, end_learning_rate=0.0, degree=1.0, last_epoch=-1): + self.num_warmup_updates = warmup_steps + self.start_warmup_steps = start_warmup_steps + self.total_steps = total_steps + self.end_learning_rate = end_learning_rate + self.degree = degree + self.offset_step = int(self.start_warmup_steps == 0) + super(LinearWarmupPolyDecayScheduler, + self).__init__(optimizer, last_epoch) + + def step(self, epoch=None): + # Instead of optimizer.param_groups['lr'], + # update optimizer._lr to avoid sync + state_dict = self.optimizer.state[self.optimizer.param_groups[0]['params'][0]] + if 'step' in state_dict: + self.last_epoch = int(state_dict['step']) + 1 + else: + self.last_epoch = 1 + lr = self.get_lr() + for param_group in self.optimizer.param_groups: + param_group['lr'] = lr + + def get_lr(self): + mod_step = self.last_epoch - self.offset_step - self.start_warmup_steps + cond = mod_step < self.num_warmup_updates + progress = (cond * (mod_step / (self.num_warmup_updates + 1e-6))) + \ + ((1.0 - cond) * (min((self.last_epoch - self.offset_step) / self.total_steps, 1))) + base_lr = self.base_lrs[0] + lr = (cond * (base_lr * progress)) + \ + ((1.0 - cond) * ((base_lr - self.end_learning_rate) * + (1-progress) ** self.degree + self.end_learning_rate)) + return lr diff --git a/models/language_modeling/pytorch/bert_large/training/gpu/utils.py b/models/language_modeling/pytorch/bert_large/training/gpu/utils.py new file mode 100644 index 000000000..c8106ebcf --- /dev/null +++ b/models/language_modeling/pytorch/bert_large/training/gpu/utils.py @@ -0,0 +1,120 @@ +# Copyright (c) 2019 NVIDIA CORPORATION. All rights reserved. 
+# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import torch +import torch.distributed as dist + +import logging.config +import random + + +def generate_seeds(rng, size): + """ + Generate list of random seeds + :param rng: random number generator + :param size: length of the returned list + """ + seeds = [rng.randint(0, 2 ** 32 - 1) for _ in range(size)] + return seeds + + +def broadcast_seeds(seeds, device): + """ + Broadcasts random seeds to all distributed workers. + Returns list of random seeds (broadcasted from workers with rank 0). + :param seeds: list of seeds (integers) + :param device: torch.device + """ + if torch.distributed.is_available() and torch.distributed.is_initialized(): + seeds_tensor = torch.LongTensor(seeds).to(device) + torch.distributed.broadcast(seeds_tensor, 0) + seeds = seeds_tensor.tolist() + return seeds + + +def setup_seeds(master_seed, epochs, device): + """ + Generates seeds from one master_seed. + Function returns (worker_seeds, shuffling_seeds), worker_seeds are later + used to initialize per-worker random number generators (mostly for + dropouts), shuffling_seeds are for RNGs resposible for reshuffling the + dataset before each epoch. + Seeds are generated on worker with rank 0 and broadcasted to all other + workers. + :param master_seed: master RNG seed used to initialize other generators + :param epochs: number of epochs + :param device: torch.device (used for distributed.broadcast) + """ + if master_seed is None: + # random master seed, random.SystemRandom() uses /dev/urandom on Unix + master_seed = random.SystemRandom().randint(0, 2 ** 32 - 1) + if get_rank() == 0: + # master seed is reported only from rank=0 worker, it's to avoid + # confusion, seeds from rank=0 are later broadcasted to other + # workers + logging.info(f"Using random master seed: {master_seed}") + else: + # master seed was specified from command line + logging.info(f"Using master seed from command line: {master_seed}") + + # initialize seeding RNG + seeding_rng = random.Random(master_seed) + + # generate worker seeds, one seed for every distributed worker + worker_seeds = generate_seeds(seeding_rng, get_world_size()) + + # generate seeds for data shuffling, one seed for every epoch + shuffling_seeds = generate_seeds(seeding_rng, epochs) + + # broadcast seeds from rank=0 to other workers + worker_seeds = broadcast_seeds(worker_seeds, device) + shuffling_seeds = broadcast_seeds(shuffling_seeds, device) + return worker_seeds, shuffling_seeds + + +def get_rank(): + """ + Gets distributed rank or returns zero if distributed is not initialized. + """ + if torch.distributed.is_available() and torch.distributed.is_initialized(): + rank = torch.distributed.get_rank() + else: + rank = 0 + return rank + + +def get_world_size(): + """ + Gets total number of distributed workers or returns one if distributed is + not initialized. 
+ """ + + if torch.distributed.is_available(): + print("Torch distributed is available.") + else: + print("Torch distributed is not available.") + + if torch.distributed.is_initialized(): + print("Torch distributed is initialized.") + else: + print("Torch distributed is not initialized.") + + if torch.distributed.is_available() and torch.distributed.is_initialized(): + world_size = torch.distributed.get_world_size() + else: + world_size = 1 + return world_size + + +def is_main_process(): + return get_rank() == 0 diff --git a/models/language_modeling/tensorflow/bert_large/training/bfloat16/optimization.py b/models/language_modeling/tensorflow/bert_large/training/bfloat16/optimization.py index 6409f5c9e..d0b88b86c 100644 --- a/models/language_modeling/tensorflow/bert_large/training/bfloat16/optimization.py +++ b/models/language_modeling/tensorflow/bert_large/training/bfloat16/optimization.py @@ -22,7 +22,6 @@ import tensorflow as tf - def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, accum_steps=1, use_tpu=False, use_multi_cpu=0): """Creates an optimizer training op.""" global_step = tf.compat.v1.train.get_or_create_global_step() @@ -58,13 +57,25 @@ def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, accum_ste # It is recommended that you use this optimizer for fine tuning, since this # is how the model was trained (note that the Adam m/v variables are NOT # loaded from init_checkpoint.) - optimizer = AdamWeightDecayOptimizer( - learning_rate=learning_rate, - weight_decay_rate=0.01, - beta_1=0.9, - beta_2=0.999, - epsilon=1e-6, - exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) + use_itex_optimizer = False + try: + import intel_extension_for_tensorflow as itex + optimizer = itex.ops.AdamWithWeightDecayOptimizer( + learning_rate=learning_rate, + weight_decay_rate=0.01, + beta_1=0.9, + beta_2=0.999, + epsilon=1e-6, + exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) + use_itex_optimizer = True + except: + optimizer = AdamWeightDecayOptimizer( + learning_rate=learning_rate, + weight_decay_rate=0.01, + beta_1=0.9, + beta_2=0.999, + epsilon=1e-6, + exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) if use_multi_cpu and (accum_steps == 1): import horovod.tensorflow as hvd @@ -140,9 +151,10 @@ def applyGrads(accum_vars, current_step): # Normally the global step update is done inside of `apply_gradients`. # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use # a different optimizer, you should probably take this line out. 
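The hunk above prefers itex.ops.AdamWithWeightDecayOptimizer when intel_extension_for_tensorflow can be imported and records that choice in use_itex_optimizer; the manual global_step increment in the next hunk is then skipped, presumably because the ITEX optimizer advances the step inside apply_gradients. The same optional-import pattern is applied to gelu and LayerNormalization in generic_ops.py further down; a self-contained sketch of the idea:

    # Editorial sketch of the optional-ITEX fallback pattern; not part of the patch.
    import tensorflow as tf

    try:
        import intel_extension_for_tensorflow as itex
        gelu_fn = itex.ops.gelu        # fused ITEX kernel when available
    except ImportError:
        gelu_fn = tf.nn.gelu           # stock TensorFlow fallback

    y = gelu_fn(features=tf.constant([[0.5, -1.0]]), approximate=True)
    print(y)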
- new_global_step = global_step + 1 - new_global_step = tf.identity(new_global_step, name='global_step_update') - train_op = tf.group(train_op, [global_step.assign(new_global_step)]) + if not use_itex_optimizer: + new_global_step = global_step + 1 + new_global_step = tf.identity(new_global_step, name='global_step_update') + train_op = tf.group(train_op, [global_step.assign(new_global_step)]) return train_op diff --git a/models/language_modeling/tensorflow/bert_large/training/bfloat16/run_pretraining.py b/models/language_modeling/tensorflow/bert_large/training/bfloat16/run_pretraining.py index d8d21f635..932983a1b 100644 --- a/models/language_modeling/tensorflow/bert_large/training/bfloat16/run_pretraining.py +++ b/models/language_modeling/tensorflow/bert_large/training/bfloat16/run_pretraining.py @@ -25,6 +25,61 @@ import generic_ops as bf import math +import sys +from tensorflow.core.protobuf import config_pb2 +from tensorflow.python.training.session_run_hook import SessionRunArgs +from tensorflow.python.training import training_util +from tensorflow.python.platform import gfile +from tensorflow.python.client import timeline +from datetime import datetime + +class LoggerHook(tf.estimator.SessionRunHook): + """ Logs runtime. """ + def __init__(self, batch_size, run_profile): + self.batch_size = batch_size + self.run_profile = run_profile + + def begin(self): + self._step = 0 + self._total_duration = 0 + self._warmup = 2 + self._global_step_tensor = training_util._get_or_create_global_step_read() + + def before_run(self, run_context): + self._start_time = datetime.now() + opts = config_pb2.RunOptions(trace_level=config_pb2.RunOptions.FULL_TRACE) + requests = {"global_step": self._global_step_tensor} + if self.run_profile: + return SessionRunArgs(requests, options=opts) + else: + return SessionRunArgs(requests) + + def after_run(self, run_context, run_values): + self._step += 1 + duration = datetime.now() - self._start_time + ms = duration.total_seconds() * 1000.00 + if self._step > self._warmup: + self._total_duration += ms + if self._step % 1 == 0: + print("Current step: %d, time in ms: %.2f" %(self._step, ms)) + else: + print("Warmup step: %d, time in ms: %.2f" %(self._step, ms)) + sys.stdout.flush() + if self._step == 4 and self.run_profile: + with gfile.Open('timeline-bert.json', "w") as f: + trace = timeline.Timeline(run_values.run_metadata.step_stats) + f.write(trace.generate_chrome_trace_format()) + + def end(self, run_context): + print("self._step: %d" %self._step) + print("Total time spent (after warmup): %.2f ms" %(self._total_duration)) + print("Time spent per iteration (after warmup): %.2f ms" %(self._total_duration/(self._step - self._warmup))) + time_takes = self._total_duration / (self._step - self._warmup) + if self.batch_size == 1: + print("Latency is %.3f ms" % (time_takes)) + print("Throughput is %.3f samples/sec" % (self.batch_size * 1000 / time_takes)) + sys.stdout.flush() + global is_mpi try: import horovod.tensorflow as hvd @@ -534,6 +589,13 @@ def main(_): inter_op_parallelism_threads=FLAGS.inter_op_parallelism_threads, intra_op_parallelism_threads=FLAGS.intra_op_parallelism_threads, allow_soft_placement=True) + if is_mpi: + gpus = tf.config.experimental.list_physical_devices('XPU') + for gpu in gpus: + tf.config.experimental.set_memory_growth(gpu, True) + if gpus: + tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'XPU') + session_config.gpu_options.visible_device_list = str(hvd.local_rank()) run_config = tf.compat.v1.estimator.tpu.RunConfig( 
cluster=tpu_cluster_resolver, master=FLAGS.master, @@ -544,7 +606,7 @@ def main(_): iterations_per_loop=FLAGS.iterations_per_loop, num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host), - log_step_count_steps=25) + log_step_count_steps=1) if bert_config.precision == "bfloat16" : tf.compat.v1.logging.info("INFO: BERT bfloat16 training....!") @@ -610,6 +672,7 @@ def main(_): tf.compat.v1.logging.info("***** Running training with profiler*****") hooks.append(tf.compat.v1.train.ProfilerHook(save_steps=3, output_dir=FLAGS.output_dir, show_memory=False)) + hooks.append(LoggerHook(FLAGS.train_batch_size, False)) estimator.train(input_fn=train_input_fn, max_steps=FLAGS.num_train_steps, hooks=hooks) diff --git a/models/language_modeling/tensorflow/bert_large/training/fp32/generic_ops.py b/models/language_modeling/tensorflow/bert_large/training/fp32/generic_ops.py index ad2baab3f..e79eaf7d5 100755 --- a/models/language_modeling/tensorflow/bert_large/training/fp32/generic_ops.py +++ b/models/language_modeling/tensorflow/bert_large/training/fp32/generic_ops.py @@ -21,7 +21,6 @@ import numpy as np import tensorflow as tf - _inprecision = tf.float32 _rprecision = tf.float32 _use_experimental_gelu = False @@ -68,11 +67,16 @@ def softmax(scores, axis=None): return r_cast(rval) def layer_norm(inputs, begin_norm_axis, begin_params_axis, scope): - inputs = i_cast(inputs) - #lnorm = tf.keras.layers.LayerNormalization(axis=1, center=True, scale=True) - lnorm = tf.keras.layers.LayerNormalization() - out_tensor = lnorm(inputs) - return r_cast(out_tensor) + lnorm = tf.keras.layers.LayerNormalization(dtype=get_keras_policy()) + + # Try to use ITEX first + try: + import intel_extension_for_tensorflow as itex + lnorm = itex.ops.LayerNormalization(dtype=_rprecision) + except ImportError: + pass + + return lnorm(inputs) "Moved from modeling.py" def gelu(x): @@ -87,7 +91,16 @@ def gelu(x): `x` with the GELU activation applied. """ if _use_experimental_gelu : - return tf.nn.gelu(features=x, approximate=True) + gelu_func = tf.nn.gelu + + # Try to use ITXE first. + try: + import intel_extension_for_tensorflow as itex + gelu_func = itex.ops.gelu + except ImportError: + pass + + return gelu_func(features=x, approximate=True) else: x = i_cast(x) cdf = 0.5 * (1.0 + tf.tanh( diff --git a/models/language_modeling/tensorflow/bert_large/training/fp32/optimization.py b/models/language_modeling/tensorflow/bert_large/training/fp32/optimization.py index 984f4a01a..d8af1467e 100644 --- a/models/language_modeling/tensorflow/bert_large/training/fp32/optimization.py +++ b/models/language_modeling/tensorflow/bert_large/training/fp32/optimization.py @@ -86,13 +86,25 @@ def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, accum_ste # It is recommended that you use this optimizer for fine tuning, since this # is how the model was trained (note that the Adam m/v variables are NOT # loaded from init_checkpoint.) 
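The LoggerHook registered above via hooks.append(LoggerHook(FLAGS.train_batch_size, False)) accumulates per-step wall time in milliseconds after a two-step warmup and prints a latency/throughput summary in end(). The arithmetic, with numbers assumed for illustration:

    # Editorial arithmetic mirroring LoggerHook.end(); the values are assumptions.
    batch_size = 32                  # stands in for FLAGS.train_batch_size
    total_duration_ms = 5000.0       # accumulated over the post-warmup steps
    steps_after_warmup = 10          # self._step - self._warmup

    ms_per_iter = total_duration_ms / steps_after_warmup    # 500.00 ms per iteration
    throughput = batch_size * 1000 / ms_per_iter             # 64.000 samples/sec
    print("Time spent per iteration (after warmup): %.2f ms" % ms_per_iter)
    print("Throughput is %.3f samples/sec" % throughput)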
- optimizer = AdamWeightDecayOptimizer( - learning_rate=learning_rate, - weight_decay_rate=0.01, - beta_1=0.9, - beta_2=0.999, - epsilon=1e-6, - exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) + use_itex_optimizer = False + try: + import intel_extension_for_tensorflow as itex + optimizer = itex.ops.AdamWithWeightDecayOptimizer( + learning_rate=learning_rate, + weight_decay_rate=0.01, + beta_1=0.9, + beta_2=0.999, + epsilon=1e-6, + exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) + use_itex_optimizer = True + except: + optimizer = AdamWeightDecayOptimizer( + learning_rate=learning_rate, + weight_decay_rate=0.01, + beta_1=0.9, + beta_2=0.999, + epsilon=1e-6, + exclude_from_weight_decay=["LayerNorm", "layer_norm", "bias"]) if use_multi_cpu and (accum_steps == 1): import horovod.tensorflow as hvd @@ -172,9 +184,10 @@ def applyGrads(accum_vars, current_step): # Normally the global step update is done inside of `apply_gradients`. # However, `AdamWeightDecayOptimizer` doesn't do this. But if you use # a different optimizer, you should probably take this line out. - new_global_step = global_step + 1 - new_global_step = tf.identity(new_global_step, name='global_step_update') - train_op = tf.group(train_op, [global_step.assign(new_global_step)]) + if not use_itex_optimizer: + new_global_step = global_step + 1 + new_global_step = tf.identity(new_global_step, name='global_step_update') + train_op = tf.group(train_op, [global_step.assign(new_global_step)]) return train_op diff --git a/models/language_modeling/tensorflow/bert_large/training/fp32/run_pretraining.py b/models/language_modeling/tensorflow/bert_large/training/fp32/run_pretraining.py index 62bc03c13..7659ccdfd 100644 --- a/models/language_modeling/tensorflow/bert_large/training/fp32/run_pretraining.py +++ b/models/language_modeling/tensorflow/bert_large/training/fp32/run_pretraining.py @@ -25,6 +25,61 @@ import generic_ops as bf import math +import sys +from tensorflow.core.protobuf import config_pb2 +from tensorflow.python.training.session_run_hook import SessionRunArgs +from tensorflow.python.training import training_util +from tensorflow.python.platform import gfile +from tensorflow.python.client import timeline +from datetime import datetime + +class LoggerHook(tf.estimator.SessionRunHook): + """ Logs runtime. 
""" + def __init__(self, batch_size, run_profile): + self.batch_size = batch_size + self.run_profile = run_profile + + def begin(self): + self._step = 0 + self._total_duration = 0 + self._warmup = 2 + self._global_step_tensor = training_util._get_or_create_global_step_read() + + def before_run(self, run_context): + self._start_time = datetime.now() + opts = config_pb2.RunOptions(trace_level=config_pb2.RunOptions.FULL_TRACE) + requests = {"global_step": self._global_step_tensor} + if self.run_profile: + return SessionRunArgs(requests, options=opts) + else: + return SessionRunArgs(requests) + + def after_run(self, run_context, run_values): + self._step += 1 + duration = datetime.now() - self._start_time + ms = duration.total_seconds() * 1000.00 + if self._step > self._warmup: + self._total_duration += ms + if self._step % 1 == 0: + print("Current step: %d, time in ms: %.2f" %(self._step, ms)) + else: + print("Warmup step: %d, time in ms: %.2f" %(self._step, ms)) + sys.stdout.flush() + if self._step == 4 and self.run_profile: + with gfile.Open('timeline-bert.json', "w") as f: + trace = timeline.Timeline(run_values.run_metadata.step_stats) + f.write(trace.generate_chrome_trace_format()) + + def end(self, run_context): + print("self._step: %d" %self._step) + print("Total time spent (after warmup): %.2f ms" %(self._total_duration)) + print("Time spent per iteration (after warmup): %.2f ms" %(self._total_duration/(self._step - self._warmup))) + time_takes = self._total_duration / (self._step - self._warmup) + if self.batch_size == 1: + print("Latency is %.3f ms" % (time_takes)) + print("Throughput is %.3f samples/sec" % (self.batch_size * 1000 / time_takes)) + sys.stdout.flush() + global is_mpi try: import horovod.tensorflow as hvd @@ -508,6 +563,13 @@ def main(_): inter_op_parallelism_threads=FLAGS.inter_op_parallelism_threads, intra_op_parallelism_threads=FLAGS.intra_op_parallelism_threads, allow_soft_placement=True) + if is_mpi: + gpus = tf.config.experimental.list_physical_devices('XPU') + for gpu in gpus: + tf.config.experimental.set_memory_growth(gpu, True) + if gpus: + tf.config.experimental.set_visible_devices(gpus[hvd.local_rank()], 'XPU') + session_config.gpu_options.visible_device_list = str(hvd.local_rank()) run_config = tf.compat.v1.estimator.tpu.RunConfig( cluster=tpu_cluster_resolver, master=FLAGS.master, @@ -518,7 +580,7 @@ def main(_): iterations_per_loop=FLAGS.iterations_per_loop, num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host), - log_step_count_steps=25) + log_step_count_steps=1) if bert_config.precision == "bfloat16" : tf.compat.v1.logging.info("INFO: BERT bfloat16 training....!") @@ -584,6 +646,7 @@ def main(_): tf.compat.v1.logging.info("***** Running training with profiler*****") hooks.append(tf.compat.v1.train.ProfilerHook(save_steps=3, output_dir=FLAGS.output_dir, show_memory=False)) + hooks.append(LoggerHook(FLAGS.train_batch_size, False)) estimator.train(input_fn=train_input_fn, max_steps=FLAGS.num_train_steps, hooks=hooks) diff --git a/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/LICENSE b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/LICENSE new file mode 100644 index 000000000..868192b12 --- /dev/null +++ b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2019 Hao Gao + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without 
restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/eval_ssd.py b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/eval_ssd.py new file mode 100644 index 000000000..b8792bfc3 --- /dev/null +++ b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/eval_ssd.py @@ -0,0 +1,430 @@ +import os +import torch +import torch.nn as nn +import intel_extension_for_pytorch +from vision.ssd.vgg_ssd import create_vgg_ssd, create_vgg_ssd_predictor +from vision.ssd.mobilenetv1_ssd import create_mobilenetv1_ssd, create_mobilenetv1_ssd_predictor +from vision.ssd.mobilenetv1_ssd_lite import create_mobilenetv1_ssd_lite, create_mobilenetv1_ssd_lite_predictor +from vision.ssd.squeezenet_ssd_lite import create_squeezenet_ssd_lite, create_squeezenet_ssd_lite_predictor +from vision.datasets.voc_dataset import VOCDataset +from vision.datasets.open_images import OpenImagesDataset +from vision.utils import box_utils, measurements +from vision.utils.misc import str2bool, Timer +import argparse +import pathlib +import numpy as np +import logging +import sys +from vision.ssd.mobilenet_v2_ssd_lite import create_mobilenetv2_ssd_lite, create_mobilenetv2_ssd_lite_predictor +from vision.ssd.data_preprocessing import PredictionTransform +from vision.ssd.config import mobilenetv1_ssd_config as config +from torch.quantization.quantize_jit import ( +convert_jit, +prepare_jit, +) +from torch.quantization import default_qconfig +from torch.jit._recursive import wrap_cpp_module +torch._C._jit_set_profiling_mode(False) +torch._C._jit_set_profiling_executor(False) + + +parser = argparse.ArgumentParser(description="SSD Evaluation on VOC Dataset.") +parser.add_argument('--net', default="vgg16-ssd", + help="The network architecture, it should be of mb1-ssd, \ + mb1-ssd-lite, mb2-ssd-lite or vgg16-ssd.") +parser.add_argument("--trained_model", type=str) +parser.add_argument("--dataset_type", default="voc", type=str, + help='Specify dataset type. 
\ + Currently supports voc and open_images.') +parser.add_argument("--dataset", type=str, + help="The root directory of the VOC dataset or Open Images dataset.") +parser.add_argument("--label_file", type=str, help="The label file path.") +parser.add_argument("--use_cuda", type=str2bool, default=False) +parser.add_argument("--use_xpu", type=str2bool, default=True) +parser.add_argument('--fp16', action='store_true', help='Datatype used: fp16') +parser.add_argument('--bf16', action='store_true', help='Datatype used: bf16') +parser.add_argument("--use_2007_metric", type=str2bool, default=True) +parser.add_argument("--nms_method", type=str, default="hard") +parser.add_argument('--iter_num', default=4951, type=int, + help='number of iterations') +parser.add_argument("--iou_threshold", type=float, default=0.5, + help="The threshold of Intersection over Union.") +parser.add_argument("--eval_dir", default="eval_results", type=str, + help="The directory to store evaluation results.") +parser.add_argument('--mb2_width_mult', default=1.0, type=float, + help='Width Multiplier for MobilenetV2') +parser.add_argument("--int8", action='store_true', help='use int8', default=False) +parser.add_argument('--mc', default=False, + help='do calibration with weight per channel quantization') +parser.add_argument('--channels_last', action='store_true', + help='Dataformat used: channel_last(plain NHWC)') +parser.add_argument('--calib_num', default=8, type=int, + help='number of calibration iterations') +parser.add_argument('--dummy', default=0, type=int, + help='dummy for perf') +parser.add_argument('--batch_size', default=1, type=int, + help='batch_size for dummy') +parser.add_argument('--image_size', default=300, type=int, + help='image_size for dummy') +parser.add_argument('--benchmark', default=0, type=int, help='for benchmark performance, move H2D out of end2end calculation') +parser.add_argument('--profile_iter', default=5, type=int, help='profile iter') +parser.add_argument('--num-iterations', default=100, type=int, help='iterations for benchmark test') +parser.add_argument("--save", help='Path to save entire model') +parser.add_argument("--load", help='Path to load entire model') +parser.add_argument('--jit', default=-1, type=int,choices=[-1, 0, 1], + help='Select run with jit or impe path, 0 : impe, 1 : jit') +args = parser.parse_args() + +if args.use_xpu: + DEVICE = torch.device("xpu") +else: + DEVICE = torch.device( + "cuda:0" if torch.cuda.is_available() and args.use_cuda else "cpu") + + +def group_annotation_by_class(dataset): + true_case_stat = {} + all_gt_boxes = {} + all_difficult_cases = {} + for i in range(len(dataset)): + image_id, annotation = dataset.get_annotation(i) + gt_boxes, classes, is_difficult = annotation + gt_boxes = torch.from_numpy(gt_boxes) + for i, difficult in enumerate(is_difficult): + class_index = int(classes[i]) + gt_box = gt_boxes[i] + if not difficult: + true_case_stat[class_index] = true_case_stat.get( + class_index, 0) + 1 + + if class_index not in all_gt_boxes: + all_gt_boxes[class_index] = {} + if image_id not in all_gt_boxes[class_index]: + all_gt_boxes[class_index][image_id] = [] + all_gt_boxes[class_index][image_id].append(gt_box) + if class_index not in all_difficult_cases: + all_difficult_cases[class_index] = {} + if image_id not in all_difficult_cases[class_index]: + all_difficult_cases[class_index][image_id] = [] + all_difficult_cases[class_index][image_id].append(difficult) + + for class_index in all_gt_boxes: + for image_id in all_gt_boxes[class_index]:
all_gt_boxes[class_index][image_id] = torch.stack( + all_gt_boxes[class_index][image_id]) + for class_index in all_difficult_cases: + for image_id in all_difficult_cases[class_index]: + all_gt_boxes[class_index][image_id] = torch.tensor( + all_gt_boxes[class_index][image_id]) + return true_case_stat, all_gt_boxes, all_difficult_cases + + +def compute_average_precision_per_class(num_true_cases, gt_boxes, + difficult_cases, prediction_file, + iou_threshold, use_2007_metric): + with open(prediction_file) as f: + image_ids = [] + boxes = [] + scores = [] + for line in f: + t = line.rstrip().split(" ") + image_ids.append(t[0]) + scores.append(float(t[1])) + box = torch.tensor([float(v) for v in t[2:]]).unsqueeze(0) + box -= 1.0 # convert to python format where indexes start from 0 + boxes.append(box) + scores = np.array(scores) + sorted_indexes = np.argsort(-scores) + boxes = [boxes[i] for i in sorted_indexes] + image_ids = [image_ids[i] for i in sorted_indexes] + true_positive = np.zeros(len(image_ids)) + false_positive = np.zeros(len(image_ids)) + matched = set() + for i, image_id in enumerate(image_ids): + box = boxes[i] + if image_id not in gt_boxes: + false_positive[i] = 1 + continue + + gt_box = gt_boxes[image_id] + ious = box_utils.iou_of(box, gt_box) + max_iou = torch.max(ious).item() + max_arg = torch.argmax(ious).item() + if max_iou > iou_threshold: + if difficult_cases[image_id][max_arg] == 0: + if (image_id, max_arg) not in matched: + true_positive[i] = 1 + matched.add((image_id, max_arg)) + else: + false_positive[i] = 1 + else: + false_positive[i] = 1 + + true_positive = true_positive.cumsum() + false_positive = false_positive.cumsum() + precision = true_positive / (true_positive + false_positive) + recall = true_positive / num_true_cases + if use_2007_metric: + return measurements.compute_voc2007_average_precision( + precision, recall) + else: + return measurements.compute_average_precision(precision, recall) + +def jit(model, dataset): + print("run JIT path...") + + model.eval() + + transform = PredictionTransform(config.image_size, mean=config.image_mean, std=config.image_std) + image = dataset.get_image(0) + height, width, _ = image.shape + image = transform(image) + images = image.unsqueeze(0) + images = images.to("xpu") + + if args.fp16: + images = images.half() + + if args.benchmark == 1: + modelJit = torch.jit.trace(model, images, check_trace=False) + else: + modelJit = torch.jit.trace(model, images) + + model = wrap_cpp_module(torch._C._jit_pass_fold_convbn(modelJit._c)) + return model + +def calib(model, dataset): + print("Calibration for INT8 ... 
") + with torch.no_grad(): + if args.mc: + qconfig = torch.quantization.QConfig( + activation=torch.quantization.observer.MinMaxObserver.with_args( + qscheme=torch.per_tensor_symmetric, + reduce_range=False, + dtype=torch.quint8 + ), + weight=torch.quantization.default_per_channel_weight_observer + ) + else: + qconfig = torch.quantization.QConfig( + activation=torch.quantization.observer.MinMaxObserver.with_args( + qscheme=torch.per_tensor_symmetric, + reduce_range=False, + dtype=torch.quint8 + ), + weight=torch.quantization.default_weight_observer + ) + + model = prepare_jit(model, {'': qconfig}, True) + + # tune acc through iteration number for calibration + for i in range(args.calib_num): + image = dataset.get_image(i) + height, width, _ = image.shape + transform = PredictionTransform(config.image_size, mean=config.image_mean, std=config.image_std) + image = transform(image) + images = image.unsqueeze(0) + images = images.to("xpu") + model = model.to("xpu") + images = images.to("xpu") + + model(images) + + model = convert_jit(model, True) + + return model + + + +if __name__ == '__main__': + pathlib.Path(args.eval_dir).mkdir(exist_ok=True) + eval_path = pathlib.Path(args.eval_dir + "/" + str(os.getpid())) + eval_path.mkdir(exist_ok=True) + timer = Timer() + class_names = [name.strip() for name in open(args.label_file).readlines()] + + if args.dataset_type == "voc": + dataset = VOCDataset(args.dataset, args.iter_num, is_test=True) + elif args.dataset_type == 'open_images': + dataset = OpenImagesDataset( + args.dataset, args.iter_num, dataset_type="test") + + true_case_stat, all_gb_boxes, all_difficult_cases = group_annotation_by_class( + dataset) + + if args.int8 and args.jit == 0: + print("int8 not support impe path") + sys.exit(1) + + if args.load: + if os.path.isfile(args.load): + load_path = args.load + typeFlag = 1 if args.fp16 else 0 + typeFlag = 2 if args.bf16 else typeFlag + channelsFlag = 1 if args.channels_last else 0 + timer.start("Load Model") + if args.jit == -1: + if args.int8 or args.fp16: + net = torch.jit.load(load_path) + else: + net = torch.load(load_path) + print("Running on", str(net.device)) + elif args.jit == 0: + net = torch.load(load_path) + print("Running on", str(net.device)) + else: + net = torch.jit.load(load_path) + print(f'It took {timer.end("Load Model")} seconds to load the model.') + else: + print("=> no saved model found at '{}'".format(args.load)) + else: + if args.net == 'vgg16-ssd': + net = create_vgg_ssd(len(class_names), is_test=True) + elif args.net == 'mb1-ssd': + net = create_mobilenetv1_ssd( + len(class_names), is_test=True, device=DEVICE) + elif args.net == 'mb1-ssd-lite': + net = create_mobilenetv1_ssd_lite(len(class_names), is_test=True) + elif args.net == 'sq-ssd-lite': + net = create_squeezenet_ssd_lite(len(class_names), is_test=True) + elif args.net == 'mb2-ssd-lite': + net = create_mobilenetv2_ssd_lite( + len(class_names), + width_mult=args.mb2_width_mult, + is_test=True) + else: + logging.fatal( + "The net type is wrong. 
\ + It should be one of vgg16-ssd, mb1-ssd and mb1-ssd-lite.") + parser.print_help(sys.stderr) + sys.exit(1) + + timer.start("Load Model") + net.load(args.trained_model) + + # convert the model to half exclude BatchNorm2d layer + # typeFlag 0 Float; typeFlag 1 Half; typeFlag 2 BFloat16 + typeFlag = 0 + if args.fp16 or args.bf16: + if args.fp16: + net.half() + typeFlag = 1 + else: + net.bfloat16() + typeFlag = 2 + for layer in net.modules(): + if isinstance(layer, nn.BatchNorm2d): + layer.float() + + net = net.to(DEVICE) + + if args.jit == -1: + if args.fp16 or args.int8: + net = jit(net, dataset) + elif args.jit == 1: + net = jit(net, dataset) + + if args.int8: + net = calib(net, dataset) + + channelsFlag = 0 + # channelsFlag 0 NCHW; channelsFlag 1 NHWC + if args.channels_last: + net = net.to(memory_format=torch.channels_last) + channelsFlag = 1 + print("Running on", DEVICE) + print(f'It took {timer.end("Load Model")} seconds to load the model.') + if args.net == 'vgg16-ssd': + predictor = create_vgg_ssd_predictor( + net, nms_method=args.nms_method, device=DEVICE) + elif args.net == 'mb1-ssd': + predictor = create_mobilenetv1_ssd_predictor( + net, nms_method=args.nms_method, device=DEVICE) + elif args.net == 'mb1-ssd-lite': + predictor = create_mobilenetv1_ssd_lite_predictor( + net, nms_method=args.nms_method, device=DEVICE) + elif args.net == 'sq-ssd-lite': + predictor = create_squeezenet_ssd_lite_predictor( + net, nms_method=args.nms_method, device=DEVICE) + elif args.net == 'mb2-ssd-lite': + predictor = create_mobilenetv2_ssd_lite_predictor( + net, nms_method=args.nms_method, device=DEVICE) + else: + logging.fatal( + "The net type is wrong. \ + It should be one of vgg16-ssd, mb1-ssd and mb1-ssd-lite.") + parser.print_help(sys.stderr) + sys.exit(1) + + results = [] + for i in range(max(len(dataset), args.num_iterations)): + if args.benchmark and i > args.num_iterations: + break + print("process image", i) + timer.start("Load Image") + image = dataset.get_image(i) if args.dummy == 0 else None + print("Load Image: {:4f} seconds.".format(timer.end("Load Image"))) + timer.start("Predict") + + profiling = os.environ.get("PROFILE", "OFF").upper() in ["1", "Y", "ON", "YES", "TRUE"] + with torch.inference_mode(): + boxes, labels, probs = predictor.predict(typeFlag, channelsFlag, image, -1, None, profiling, args, i) + if profiling and i == args.profile_iter: + break + # sync for time measurement + torch.xpu.synchronize() + print("Prediction: {:4f} seconds.".format(timer.end("Predict"))) + indexes = torch.ones(labels.size(0), 1, dtype=torch.float32) * i + results.append(torch.cat([ + indexes.reshape(-1, 1), + labels.reshape(-1, 1).float(), + probs.reshape(-1, 1), + boxes + 1.0 # matlab's indexes start from 1 + ], dim=1)) + + if args.save: + store_path = args.save + if args.jit == -1: + if args.int8 or args.fp16: + torch.jit.save(net, store_path) + else: + torch.save(net, store_path) + elif args.jit == 0: + torch.save(net, store_path) + else: + torch.jit.save(net, store_path) + if args.dummy > 0: + sys.exit(0) + + results = torch.cat(results) + for class_index, class_name in enumerate(class_names): + if class_index == 0: + continue # ignore background + prediction_path = eval_path / f"det_test_{class_name}.txt" + with open(prediction_path, "w") as f: + sub = results[results[:, 1] == class_index, :] + for i in range(sub.size(0)): + prob_box = sub[i, 2:].numpy() + image_id = dataset.ids[int(sub[i, 0])] + print( + image_id + " " + " ".join([str(v) for v in prob_box]), + file=f + ) + aps = [] + 
print("\n\nAverage Precision Per-class:") + for class_index, class_name in enumerate(class_names): + if class_index == 0: + continue + prediction_path = eval_path / f"det_test_{class_name}.txt" + ap = compute_average_precision_per_class( + true_case_stat[class_index], + all_gb_boxes[class_index], + all_difficult_cases[class_index], + prediction_path, + args.iou_threshold, + args.use_2007_metric + ) + aps.append(ap) + print(f"{class_name}: {ap}") + + print(f"\nAverage Precision Across All Classes:{sum(aps)/len(aps)}") diff --git a/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/__init__.py b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/datasets/__init__.py b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/datasets/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/datasets/collation.py b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/datasets/collation.py new file mode 100644 index 000000000..1162c426e --- /dev/null +++ b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/datasets/collation.py @@ -0,0 +1,31 @@ +import torch +import numpy as np + + +def object_detection_collate(batch): + images = [] + gt_boxes = [] + gt_labels = [] + image_type = type(batch[0][0]) + box_type = type(batch[0][1]) + label_type = type(batch[0][2]) + for image, boxes, labels in batch: + if image_type is np.ndarray: + images.append(torch.from_numpy(image)) + elif image_type is torch.Tensor: + images.append(image) + else: + raise TypeError(f"Image should be tensor or np.ndarray, but got {image_type}.") + if box_type is np.ndarray: + gt_boxes.append(torch.from_numpy(boxes)) + elif box_type is torch.Tensor: + gt_boxes.append(boxes) + else: + raise TypeError(f"Boxes should be tensor or np.ndarray, but got {box_type}.") + if label_type is np.ndarray: + gt_labels.append(torch.from_numpy(labels)) + elif label_type is torch.Tensor: + gt_labels.append(labels) + else: + raise TypeError(f"Labels should be tensor or np.ndarray, but got {label_type}.") + return torch.stack(images), gt_boxes, gt_labels \ No newline at end of file diff --git a/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/datasets/generate_vocdata.py b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/datasets/generate_vocdata.py new file mode 100644 index 000000000..b6f7c5541 --- /dev/null +++ b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/datasets/generate_vocdata.py @@ -0,0 +1,127 @@ +import glob +import sys +import os +import xml.etree.ElementTree as ET +from random import random + +def main(filename): + # ratio to divide up the images + train = 0.7 + val = 0.2 + test = 0.1 + if (train + test + val) != 1.0: + print("probabilities must equal 1") + exit() + + # get the labels + labels = [] + imgnames = [] + annotations = {} + + with open(filename, 'r') as labelfile: + label_string = "" + for line in labelfile: + label_string += line.rstrip() + + labels = label_string.split(',') + labels = [elem.replace(" ", "") for elem in labels] + + # get image names + for filename in os.listdir("./JPEGImages"): + if filename.endswith(".jpg"): + img = filename.rstrip('.jpg') + imgnames.append(img) + + print("Labels:", labels, "imgcnt:", len(imgnames)) + + # initialise annotation list + for label 
in labels: + annotations[label] = [] + + # Scan the annotations for the labels + for img in imgnames: + annote = "Annotations/" + img + '.xml' + if os.path.isfile(annote): + tree = ET.parse(annote) + root = tree.getroot() + annote_labels = [] + for labelname in root.findall('*/name'): + labelname = labelname.text + annote_labels.append(labelname) + if labelname in labels: + annotations[labelname].append(img) + annotations[img] = annote_labels + else: + print("Missing annotation for ", annote) + exit() + + # divvy up the images to the different sets + sampler = imgnames.copy() + train_list = [] + val_list = [] + test_list = [] + + while len(sampler) > 0: + dice = random() + elem = sampler.pop() + + if dice <= test: + test_list.append(elem) + elif dice <= (test + val): + val_list.append(elem) + else: + train_list.append(elem) + + print("Training set:", len(train_list), "validation set:", len(val_list), "test set:", len(test_list)) + + + # create the dataset files + create_folder("./ImageSets/Main/") + with open("./ImageSets/Main/train.txt", 'w') as outfile: + for name in train_list: + outfile.write(name + "\n") + with open("./ImageSets/Main/val.txt", 'w') as outfile: + for name in val_list: + outfile.write(name + "\n") + with open("./ImageSets/Main/trainval.txt", 'w') as outfile: + for name in train_list: + outfile.write(name + "\n") + for name in val_list: + outfile.write(name + "\n") + + with open("./ImageSets/Main/test.txt", 'w') as outfile: + for name in test_list: + outfile.write(name + "\n") + + # create the individiual files for each label + for label in labels: + with open("./ImageSets/Main/"+ label +"_train.txt", 'w') as outfile: + for name in train_list: + if label in annotations[name]: + outfile.write(name + " 1\n") + else: + outfile.write(name + " -1\n") + with open("./ImageSets/Main/"+ label +"_val.txt", 'w') as outfile: + for name in val_list: + if label in annotations[name]: + outfile.write(name + " 1\n") + else: + outfile.write(name + " -1\n") + with open("./ImageSets/Main/"+ label +"_test.txt", 'w') as outfile: + for name in test_list: + if label in annotations[name]: + outfile.write(name + " 1\n") + else: + outfile.write(name + " -1\n") + +def create_folder(foldername): + if os.path.exists(foldername): + print('folder already exists:', foldername) + else: + os.makedirs(foldername) + +if __name__=='__main__': + if len(sys.argv) < 2: + print("usage: python generate_vocdata.py ") + exit() + main(sys.argv[1]) diff --git a/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/datasets/open_images.py b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/datasets/open_images.py new file mode 100644 index 000000000..c2161ed56 --- /dev/null +++ b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/datasets/open_images.py @@ -0,0 +1,118 @@ +import numpy as np +import pathlib +import cv2 +import pandas as pd +import copy + +class OpenImagesDataset: + + def __init__(self, root, iter_num, + transform=None, target_transform=None, + dataset_type="train", balance_data=False): + self.root = pathlib.Path(root) + self.iter_num = iter_num + self.transform = transform + self.target_transform = target_transform + self.dataset_type = dataset_type.lower() + + self.data, self.class_names, self.class_dict = self._read_data() + self.balance_data = balance_data + self.min_image_num = -1 + if self.balance_data: + self.data = self._balance_data() + self.ids = [info['image_id'] for info in self.data] + + self.class_stat = None + + def _getitem(self, index): + 
image_info = self.data[index] + image = self._read_image(image_info['image_id']) + # duplicate boxes to prevent corruption of dataset + boxes = copy.copy(image_info['boxes']) + boxes[:, 0] *= image.shape[1] + boxes[:, 1] *= image.shape[0] + boxes[:, 2] *= image.shape[1] + boxes[:, 3] *= image.shape[0] + # duplicate labels to prevent corruption of dataset + labels = copy.copy(image_info['labels']) + if self.transform: + image, boxes, labels = self.transform(image, boxes, labels) + if self.target_transform: + boxes, labels = self.target_transform(boxes, labels) + return image_info['image_id'], image, boxes, labels + + def __getitem__(self, index): + _, image, boxes, labels = self._getitem(index) + return image, boxes, labels + + def get_annotation(self, index): + """To conform the eval_ssd implementation that is based on the VOC dataset.""" + image_id, image, boxes, labels = self._getitem(index) + is_difficult = np.zeros(boxes.shape[0], dtype=np.uint8) + return image_id, (boxes, labels, is_difficult) + + def get_image(self, index): + image_info = self.data[index] + image = self._read_image(image_info['image_id']) + if self.transform: + image, _ = self.transform(image) + return image + + def _read_data(self): + annotation_file = f"{self.root}/sub-{self.dataset_type}-annotations-bbox.csv" + annotations = pd.read_csv(annotation_file) + class_names = ['BACKGROUND'] + sorted(list(annotations['ClassName'].unique())) + class_dict = {class_name: i for i, class_name in enumerate(class_names)} + data = [] + for image_id, group in annotations.groupby("ImageID"): + boxes = group.loc[:, ["XMin", "YMin", "XMax", "YMax"]].values.astype(np.float32) + # make labels 64 bits to satisfy the cross_entropy function + labels = np.array([class_dict[name] for name in group["ClassName"]], dtype='int64') + data.append({ + 'image_id': image_id, + 'boxes': boxes, + 'labels': labels + }) + return data, class_names, class_dict + + def __len__(self): + return len(self.data) if self.iter_num > len(self.data) else self.iter_num + + def __repr__(self): + if self.class_stat is None: + self.class_stat = {name: 0 for name in self.class_names[1:]} + for example in self.data: + for class_index in example['labels']: + class_name = self.class_names[class_index] + self.class_stat[class_name] += 1 + content = ["Dataset Summary:" + f"Number of Images: {len(self.data)}", + f"Minimum Number of Images for a Class: {self.min_image_num}", + "Label Distribution:"] + for class_name, num in self.class_stat.items(): + content.append(f"\t{class_name}: {num}") + return "\n".join(content) + + def _read_image(self, image_id): + image_file = self.root / self.dataset_type / f"{image_id}.jpg" + image = cv2.imread(str(image_file)) + if image.shape[2] == 1: + image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB) + else: + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + return image + + def _balance_data(self): + label_image_indexes = [set() for _ in range(len(self.class_names))] + for i, image in enumerate(self.data): + for label_id in image['labels']: + label_image_indexes[label_id].add(i) + label_stat = [len(s) for s in label_image_indexes] + self.min_image_num = min(label_stat[1:]) + sample_image_indexes = set() + for image_indexes in label_image_indexes[1:]: + image_indexes = np.array(list(image_indexes)) + sub = np.random.permutation(image_indexes)[:self.min_image_num] + sample_image_indexes.update(sub) + sample_data = [self.data[i] for i in sample_image_indexes] + return sample_data diff --git 
a/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/datasets/voc_dataset.py b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/datasets/voc_dataset.py new file mode 100644 index 000000000..e28790d7c --- /dev/null +++ b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/datasets/voc_dataset.py @@ -0,0 +1,124 @@ +import numpy as np +import logging +import pathlib +import xml.etree.ElementTree as ET +import cv2 +import os + + +class VOCDataset: + + def __init__(self, root, iter_num, transform=None, target_transform=None, is_test=False, keep_difficult=False, label_file=None): + """Dataset for VOC data. + Args: + root: the root of the VOC2007 or VOC2012 dataset, the directory contains the following sub-directories: + Annotations, ImageSets, JPEGImages, SegmentationClass, SegmentationObject. + """ + self.root = pathlib.Path(root) + self.iter_num = iter_num + self.transform = transform + self.target_transform = target_transform + if is_test: + image_sets_file = self.root / "ImageSets/Main/test.txt" + else: + image_sets_file = self.root / "ImageSets/Main/trainval.txt" + self.ids = VOCDataset._read_image_ids(image_sets_file) + self.keep_difficult = keep_difficult + + # if the labels file exists, read in the class names + label_file_name = self.root / "labels.txt" + + if os.path.isfile(label_file_name): + class_string = "" + with open(label_file_name, 'r') as infile: + for line in infile: + class_string += line.rstrip() + + # classes should be a comma separated list + + classes = class_string.split(',') + # prepend BACKGROUND as first class + classes.insert(0, 'BACKGROUND') + classes = [ elem.replace(" ", "") for elem in classes] + self.class_names = tuple(classes) + logging.info("VOC Labels read from file: " + str(self.class_names)) + + else: + logging.info("No labels file, using default VOC classes.") + self.class_names = ('BACKGROUND', + 'aeroplane', 'bicycle', 'bird', 'boat', + 'bottle', 'bus', 'car', 'cat', 'chair', + 'cow', 'diningtable', 'dog', 'horse', + 'motorbike', 'person', 'pottedplant', + 'sheep', 'sofa', 'train', 'tvmonitor') + + + self.class_dict = {class_name: i for i, class_name in enumerate(self.class_names)} + + def __getitem__(self, index): + image_id = self.ids[index] + boxes, labels, is_difficult = self._get_annotation(image_id) + if not self.keep_difficult: + boxes = boxes[is_difficult == 0] + labels = labels[is_difficult == 0] + image = self._read_image(image_id) + if self.transform: + image, boxes, labels = self.transform(image, boxes, labels) + if self.target_transform: + boxes, labels = self.target_transform(boxes, labels) + return image, boxes, labels + + def get_image(self, index): + image_id = self.ids[index] + image = self._read_image(image_id) + if self.transform: + image, _ = self.transform(image) + return image + + def get_annotation(self, index): + image_id = self.ids[index] + return image_id, self._get_annotation(image_id) + + def __len__(self): + return len(self.ids) if self.iter_num > len(self.ids) else self.iter_num + + @staticmethod + def _read_image_ids(image_sets_file): + ids = [] + with open(image_sets_file) as f: + for line in f: + ids.append(line.rstrip()) + return ids + + def _get_annotation(self, image_id): + annotation_file = self.root / f"Annotations/{image_id}.xml" + objects = ET.parse(annotation_file).findall("object") + boxes = [] + labels = [] + is_difficult = [] + for object in objects: + class_name = object.find('name').text.lower().strip() + # we're only concerned with clases in our list + if 
class_name in self.class_dict: + bbox = object.find('bndbox') + + # VOC dataset format follows Matlab, in which indexes start from 0 + x1 = float(bbox.find('xmin').text) - 1 + y1 = float(bbox.find('ymin').text) - 1 + x2 = float(bbox.find('xmax').text) - 1 + y2 = float(bbox.find('ymax').text) - 1 + boxes.append([x1, y1, x2, y2]) + + labels.append(self.class_dict[class_name]) + is_difficult_str = object.find('difficult').text + is_difficult.append(int(is_difficult_str) if is_difficult_str else 0) + + return (np.array(boxes, dtype=np.float32), + np.array(labels, dtype=np.int64), + np.array(is_difficult, dtype=np.uint8)) + + def _read_image(self, image_id): + image_file = self.root / f"JPEGImages/{image_id}.jpg" + image = cv2.imread(str(image_file)) + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + return image diff --git a/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/nn/__init__.py b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/nn/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/nn/alexnet.py b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/nn/alexnet.py new file mode 100644 index 000000000..a91644daa --- /dev/null +++ b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/nn/alexnet.py @@ -0,0 +1,61 @@ +import torch.nn as nn +import torch.utils.model_zoo as model_zoo + +# copied from torchvision (https://github.com/pytorch/vision/blob/master/torchvision/models/alexnet.py). +# The forward function is modified for model pruning. + +__all__ = ['AlexNet', 'alexnet'] + + +model_urls = { + 'alexnet': 'https://download.pytorch.org/models/alexnet-owt-4df8aa71.pth', +} + + +class AlexNet(nn.Module): + + def __init__(self, num_classes=1000): + super(AlexNet, self).__init__() + self.features = nn.Sequential( + nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=3, stride=2), + nn.Conv2d(64, 192, kernel_size=5, padding=2), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=3, stride=2), + nn.Conv2d(192, 384, kernel_size=3, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(384, 256, kernel_size=3, padding=1), + nn.ReLU(inplace=True), + nn.Conv2d(256, 256, kernel_size=3, padding=1), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=3, stride=2), + ) + self.classifier = nn.Sequential( + nn.Dropout(), + nn.Linear(256 * 6 * 6, 4096), + nn.ReLU(inplace=True), + nn.Dropout(), + nn.Linear(4096, 4096), + nn.ReLU(inplace=True), + nn.Linear(4096, num_classes), + ) + + def forward(self, x): + x = self.features(x) + x = x.view(x.size(0), -1) + x = self.classifier(x) + return x + + +def alexnet(pretrained=False, **kwargs): + r"""AlexNet model architecture from the + `"One weird trick..." `_ paper. 
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = AlexNet(**kwargs) + if pretrained: + model.load_state_dict(model_zoo.load_url(model_urls['alexnet'])) + return model \ No newline at end of file diff --git a/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/nn/mobilenet.py b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/nn/mobilenet.py new file mode 100644 index 000000000..98300df83 --- /dev/null +++ b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/nn/mobilenet.py @@ -0,0 +1,52 @@ +# borrowed from "https://github.com/marvis/pytorch-mobilenet" + +import torch.nn as nn +import torch.nn.functional as F + + +class MobileNetV1(nn.Module): + def __init__(self, num_classes=1024): + super(MobileNetV1, self).__init__() + + def conv_bn(inp, oup, stride): + return nn.Sequential( + nn.Conv2d(inp, oup, 3, stride, 1, bias=False), + nn.BatchNorm2d(oup), + nn.ReLU(inplace=True) + ) + + def conv_dw(inp, oup, stride): + return nn.Sequential( + nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False), + nn.BatchNorm2d(inp), + nn.ReLU(inplace=True), + + nn.Conv2d(inp, oup, 1, 1, 0, bias=False), + nn.BatchNorm2d(oup), + nn.ReLU(inplace=True), + ) + + self.model = nn.Sequential( + conv_bn(3, 32, 2), + conv_dw(32, 64, 1), + conv_dw(64, 128, 2), + conv_dw(128, 128, 1), + conv_dw(128, 256, 2), + conv_dw(256, 256, 1), + conv_dw(256, 512, 2), + conv_dw(512, 512, 1), + conv_dw(512, 512, 1), + conv_dw(512, 512, 1), + conv_dw(512, 512, 1), + conv_dw(512, 512, 1), + conv_dw(512, 1024, 2), + conv_dw(1024, 1024, 1), + ) + self.fc = nn.Linear(1024, num_classes) + + def forward(self, x): + x = self.model(x) + x = F.avg_pool2d(x, 7) + x = x.view(-1, 1024) + x = self.fc(x) + return x \ No newline at end of file diff --git a/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/nn/mobilenet_v2.py b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/nn/mobilenet_v2.py new file mode 100644 index 000000000..494fdad9d --- /dev/null +++ b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/nn/mobilenet_v2.py @@ -0,0 +1,173 @@ +import torch.nn as nn +import math + +# Modified from https://github.com/tonylins/pytorch-mobilenet-v2/blob/master/MobileNetV2.py. +# In this version, Relu6 is replaced with Relu to make it ONNX compatible. +# BatchNorm Layer is optional to make it easy to do batch norm fusion.
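For reference, the conv_dw blocks in mobilenet.py above are depthwise-separable convolutions, which is where MobileNet's parameter savings come from. A quick illustrative comparison (not part of the patch) for a hypothetical 64-to-128 channel 3x3 layer:

import torch.nn as nn

def param_count(module):
    return sum(p.numel() for p in module.parameters())

# Standard 3x3 convolution: 64 * 128 * 3 * 3 = 73,728 weights.
standard = nn.Conv2d(64, 128, 3, padding=1, bias=False)

# conv_dw-style depthwise + pointwise pair (BatchNorm/ReLU omitted):
# 64 * 3 * 3 = 576 depthwise weights plus 64 * 128 = 8,192 pointwise weights.
separable = nn.Sequential(
    nn.Conv2d(64, 64, 3, padding=1, groups=64, bias=False),
    nn.Conv2d(64, 128, 1, bias=False),
)

print(param_count(standard), param_count(separable))  # 73728 8768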
+ + +def conv_bn(inp, oup, stride, use_batch_norm=True, onnx_compatible=False): + ReLU = nn.ReLU if onnx_compatible else nn.ReLU6 + + if use_batch_norm: + return nn.Sequential( + nn.Conv2d(inp, oup, 3, stride, 1, bias=False), + nn.BatchNorm2d(oup), + ReLU(inplace=True) + ) + else: + return nn.Sequential( + nn.Conv2d(inp, oup, 3, stride, 1, bias=False), + ReLU(inplace=True) + ) + + +def conv_1x1_bn(inp, oup, use_batch_norm=True, onnx_compatible=False): + ReLU = nn.ReLU if onnx_compatible else nn.ReLU6 + if use_batch_norm: + return nn.Sequential( + nn.Conv2d(inp, oup, 1, 1, 0, bias=False), + nn.BatchNorm2d(oup), + ReLU(inplace=True) + ) + else: + return nn.Sequential( + nn.Conv2d(inp, oup, 1, 1, 0, bias=False), + ReLU(inplace=True) + ) + + +class InvertedResidual(nn.Module): + def __init__(self, inp, oup, stride, expand_ratio, use_batch_norm=True, onnx_compatible=False): + super(InvertedResidual, self).__init__() + ReLU = nn.ReLU if onnx_compatible else nn.ReLU6 + + self.stride = stride + assert stride in [1, 2] + + hidden_dim = round(inp * expand_ratio) + self.use_res_connect = self.stride == 1 and inp == oup + + if expand_ratio == 1: + if use_batch_norm: + self.conv = nn.Sequential( + # dw + nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False), + nn.BatchNorm2d(hidden_dim), + ReLU(inplace=True), + # pw-linear + nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False), + nn.BatchNorm2d(oup), + ) + else: + self.conv = nn.Sequential( + # dw + nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False), + ReLU(inplace=True), + # pw-linear + nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False), + ) + else: + if use_batch_norm: + self.conv = nn.Sequential( + # pw + nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False), + nn.BatchNorm2d(hidden_dim), + ReLU(inplace=True), + # dw + nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False), + nn.BatchNorm2d(hidden_dim), + ReLU(inplace=True), + # pw-linear + nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False), + nn.BatchNorm2d(oup), + ) + else: + self.conv = nn.Sequential( + # pw + nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False), + ReLU(inplace=True), + # dw + nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False), + ReLU(inplace=True), + # pw-linear + nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False), + ) + + def forward(self, x): + if self.use_res_connect: + return x + self.conv(x) + else: + return self.conv(x) + + +class MobileNetV2(nn.Module): + def __init__(self, n_class=1000, input_size=224, width_mult=1., dropout_ratio=0.2, + use_batch_norm=True, onnx_compatible=False): + super(MobileNetV2, self).__init__() + block = InvertedResidual + input_channel = 32 + last_channel = 1280 + interverted_residual_setting = [ + # t, c, n, s + [1, 16, 1, 1], + [6, 24, 2, 2], + [6, 32, 3, 2], + [6, 64, 4, 2], + [6, 96, 3, 1], + [6, 160, 3, 2], + [6, 320, 1, 1], + ] + + # building first layer + assert input_size % 32 == 0 + input_channel = int(input_channel * width_mult) + self.last_channel = int(last_channel * width_mult) if width_mult > 1.0 else last_channel + self.features = [conv_bn(3, input_channel, 2, onnx_compatible=onnx_compatible)] + # building inverted residual blocks + for t, c, n, s in interverted_residual_setting: + output_channel = int(c * width_mult) + for i in range(n): + if i == 0: + self.features.append(block(input_channel, output_channel, s, + expand_ratio=t, use_batch_norm=use_batch_norm, + onnx_compatible=onnx_compatible)) + else: + self.features.append(block(input_channel, 
output_channel, 1, + expand_ratio=t, use_batch_norm=use_batch_norm, + onnx_compatible=onnx_compatible)) + input_channel = output_channel + # building last several layers + self.features.append(conv_1x1_bn(input_channel, self.last_channel, + use_batch_norm=use_batch_norm, onnx_compatible=onnx_compatible)) + # make it nn.Sequential + self.features = nn.Sequential(*self.features) + + # building classifier + self.classifier = nn.Sequential( + nn.Dropout(dropout_ratio), + nn.Linear(self.last_channel, n_class), + ) + + self._initialize_weights() + + def forward(self, x): + x = self.features(x) + x = x.mean(3).mean(2) + x = self.classifier(x) + return x + + def _initialize_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2. / n)) + if m.bias is not None: + m.bias.data.zero_() + elif isinstance(m, nn.BatchNorm2d): + m.weight.data.fill_(1) + m.bias.data.zero_() + elif isinstance(m, nn.Linear): + n = m.weight.size(1) + m.weight.data.normal_(0, 0.01) + m.bias.data.zero_() diff --git a/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/nn/multibox_loss.py b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/nn/multibox_loss.py new file mode 100644 index 000000000..2351c7607 --- /dev/null +++ b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/nn/multibox_loss.py @@ -0,0 +1,47 @@ +import torch.nn as nn +import torch.nn.functional as F +import torch + + +from ..utils import box_utils + + +class MultiboxLoss(nn.Module): + def __init__(self, priors, iou_threshold, neg_pos_ratio, + center_variance, size_variance, device): + """Implement SSD Multibox Loss. + + Basically, Multibox loss combines classification loss + and Smooth L1 regression loss. + """ + super(MultiboxLoss, self).__init__() + self.iou_threshold = iou_threshold + self.neg_pos_ratio = neg_pos_ratio + self.center_variance = center_variance + self.size_variance = size_variance + self.priors = priors + self.priors.to(device) + + def forward(self, confidence, predicted_locations, labels, gt_locations): + """Compute classification loss and smooth l1 loss. + + Args: + confidence (batch_size, num_priors, num_classes): class predictions. + locations (batch_size, num_priors, 4): predicted locations. + labels (batch_size, num_priors): real labels of all the priors. + boxes (batch_size, num_priors, 4): real boxes corresponding all the priors. 
+ """ + num_classes = confidence.size(2) + with torch.no_grad(): + # derived from cross_entropy=sum(log(p)) + loss = -F.log_softmax(confidence, dim=2)[:, :, 0] + mask = box_utils.hard_negative_mining(loss, labels, self.neg_pos_ratio) + + confidence = confidence[mask, :] + classification_loss = F.cross_entropy(confidence.reshape(-1, num_classes), labels[mask], size_average=False) + pos_mask = labels > 0 + predicted_locations = predicted_locations[pos_mask, :].reshape(-1, 4) + gt_locations = gt_locations[pos_mask, :].reshape(-1, 4) + smooth_l1_loss = F.smooth_l1_loss(predicted_locations, gt_locations, size_average=False) + num_pos = gt_locations.size(0) + return smooth_l1_loss/num_pos, classification_loss/num_pos diff --git a/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/nn/scaled_l2_norm.py b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/nn/scaled_l2_norm.py new file mode 100644 index 000000000..c1fd642e8 --- /dev/null +++ b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/nn/scaled_l2_norm.py @@ -0,0 +1,19 @@ +import torch.nn as nn +import torch +import torch.nn.functional as F + + +class ScaledL2Norm(nn.Module): + def __init__(self, in_channels, initial_scale): + super(ScaledL2Norm, self).__init__() + self.in_channels = in_channels + self.scale = nn.Parameter(torch.Tensor(in_channels)) + self.initial_scale = initial_scale + self.reset_parameters() + + def forward(self, x): + return (F.normalize(x, p=2, dim=1) + * self.scale.unsqueeze(0).unsqueeze(2).unsqueeze(3)) + + def reset_parameters(self): + self.scale.data.fill_(self.initial_scale) \ No newline at end of file diff --git a/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/nn/squeezenet.py b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/nn/squeezenet.py new file mode 100644 index 000000000..d96167810 --- /dev/null +++ b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/nn/squeezenet.py @@ -0,0 +1,130 @@ +import math +import torch +import torch.nn as nn +import torch.nn.init as init +import torch.utils.model_zoo as model_zoo + + +__all__ = ['SqueezeNet', 'squeezenet1_0', 'squeezenet1_1'] + + +model_urls = { + 'squeezenet1_0': 'https://download.pytorch.org/models/squeezenet1_0-a815701f.pth', + 'squeezenet1_1': 'https://download.pytorch.org/models/squeezenet1_1-f364aa15.pth', +} + + +class Fire(nn.Module): + + def __init__(self, inplanes, squeeze_planes, + expand1x1_planes, expand3x3_planes): + super(Fire, self).__init__() + self.inplanes = inplanes + self.squeeze = nn.Conv2d(inplanes, squeeze_planes, kernel_size=1) + self.squeeze_activation = nn.ReLU(inplace=True) + self.expand1x1 = nn.Conv2d(squeeze_planes, expand1x1_planes, + kernel_size=1) + self.expand1x1_activation = nn.ReLU(inplace=True) + self.expand3x3 = nn.Conv2d(squeeze_planes, expand3x3_planes, + kernel_size=3, padding=1) + self.expand3x3_activation = nn.ReLU(inplace=True) + + def forward(self, x): + x = self.squeeze_activation(self.squeeze(x)) + return torch.cat([ + self.expand1x1_activation(self.expand1x1(x)), + self.expand3x3_activation(self.expand3x3(x)) + ], 1) + + +class SqueezeNet(nn.Module): + + def __init__(self, version=1.0, num_classes=1000): + super(SqueezeNet, self).__init__() + if version not in [1.0, 1.1]: + raise ValueError("Unsupported SqueezeNet version {version}:" + "1.0 or 1.1 expected".format(version=version)) + self.num_classes = num_classes + if version == 1.0: + self.features = nn.Sequential( + nn.Conv2d(3, 96, kernel_size=7, stride=2), + 
nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True), + Fire(96, 16, 64, 64), + Fire(128, 16, 64, 64), + Fire(128, 32, 128, 128), + nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True), + Fire(256, 32, 128, 128), + Fire(256, 48, 192, 192), + Fire(384, 48, 192, 192), + Fire(384, 64, 256, 256), + nn.MaxPool2d(kernel_size=3, stride=2, ceil_mode=True), + Fire(512, 64, 256, 256), + ) + else: + self.features = nn.Sequential( + nn.Conv2d(3, 64, kernel_size=3, stride=2), + nn.ReLU(inplace=True), + nn.MaxPool2d(kernel_size=3, stride=2), + Fire(64, 16, 64, 64), + Fire(128, 16, 64, 64), + nn.MaxPool2d(kernel_size=3, stride=2), + Fire(128, 32, 128, 128), + Fire(256, 32, 128, 128), + nn.MaxPool2d(kernel_size=3, stride=2), + Fire(256, 48, 192, 192), + Fire(384, 48, 192, 192), + Fire(384, 64, 256, 256), + Fire(512, 64, 256, 256), + ) + # Final convolution is initialized differently form the rest + final_conv = nn.Conv2d(512, self.num_classes, kernel_size=1) + self.classifier = nn.Sequential( + nn.Dropout(p=0.5), + final_conv, + nn.ReLU(inplace=True), + nn.AvgPool2d(13, stride=1) + ) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + if m is final_conv: + init.normal_(m.weight, mean=0.0, std=0.01) + else: + init.kaiming_uniform_(m.weight) + if m.bias is not None: + init.constant_(m.bias, 0) + + def forward(self, x): + x = self.features(x) + x = self.classifier(x) + return x.view(x.size(0), self.num_classes) + + +def squeezenet1_0(pretrained=False, **kwargs): + r"""SqueezeNet model architecture from the `"SqueezeNet: AlexNet-level + accuracy with 50x fewer parameters and <0.5MB model size" + `_ paper. + + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = SqueezeNet(version=1.0, **kwargs) + if pretrained: + model.load_state_dict(model_zoo.load_url(model_urls['squeezenet1_0'])) + return model + + +def squeezenet1_1(pretrained=False, **kwargs): + r"""SqueezeNet 1.1 model from the `official SqueezeNet repo + `_. + SqueezeNet 1.1 has 2.4x less computation and slightly fewer parameters + than SqueezeNet 1.0, without sacrificing accuracy. 
+ + Args: + pretrained (bool): If True, returns a model pre-trained on ImageNet + """ + model = SqueezeNet(version=1.1, **kwargs) + if pretrained: + model.load_state_dict(model_zoo.load_url(model_urls['squeezenet1_1'])) + return model diff --git a/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/nn/vgg.py b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/nn/vgg.py new file mode 100644 index 000000000..7043c21d8 --- /dev/null +++ b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/nn/vgg.py @@ -0,0 +1,25 @@ +import torch.nn as nn + + +# borrowed from https://github.com/amdegroot/ssd.pytorch/blob/master/ssd.py +def vgg(cfg, batch_norm=False): + layers = [] + in_channels = 3 + for v in cfg: + if v == 'M': + layers += [nn.MaxPool2d(kernel_size=2, stride=2)] + elif v == 'C': + layers += [nn.MaxPool2d(kernel_size=2, stride=2, ceil_mode=True)] + else: + conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) + if batch_norm: + layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU(inplace=True)] + else: + layers += [conv2d, nn.ReLU(inplace=True)] + in_channels = v + pool5 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1) + conv6 = nn.Conv2d(512, 1024, kernel_size=3, padding=6, dilation=6) + conv7 = nn.Conv2d(1024, 1024, kernel_size=1) + layers += [pool5, conv6, + nn.ReLU(inplace=True), conv7, nn.ReLU(inplace=True)] + return layers \ No newline at end of file diff --git a/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/prunning/__init__.py b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/prunning/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/prunning/prunner.py b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/prunning/prunner.py new file mode 100644 index 000000000..41da14f0f --- /dev/null +++ b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/prunning/prunner.py @@ -0,0 +1,233 @@ +import torch +import torch.nn as nn +import logging +from heapq import nsmallest + +from ..utils.model_book import ModelBook + + +class ModelPrunner: + def __init__(self, model, train_fun, ignored_paths=[]): + """ Implement the pruning algorithm described in the paper https://arxiv.org/pdf/1611.06440.pdf . + + The pruning criterion is dC/dh * h, where C is the cost and h is the activation. + """ + self.model = model + self.train_fun = train_fun + self.ignored_paths = ignored_paths + self.book = ModelBook(self.model) + self.outputs = {} + self.grads = {} + self.handles = [] + self.decendent_batch_norms = {} # descendants impacted by the conv layers. + self.last_conv_path = None # used to trace the graph + self.descendent_convs = {} # descendants impacted by the conv layers. + self.descendent_linears = {} # descendants impacted by the linear layers.
+ self.last_linear_path = None # used to trace the graph + + def _make_new_conv(self, conv, filter_index, channel_type="out"): + if not isinstance(conv, nn.Conv2d): + raise TypeError(f"The module is not Conv2d, but {type(conv)}.") + + if channel_type == "out": + new_conv = nn.Conv2d(conv.in_channels, conv.out_channels - 1, conv.kernel_size, conv.stride, + conv.padding, conv.dilation, conv.groups, conv.bias is not None) + mask = torch.ones(conv.out_channels, dtype=torch.uint8) + mask[filter_index] = 0 + new_conv.weight.data = conv.weight.data[mask, :, :, :] + if conv.bias is not None: + new_conv.bias.data = conv.bias.data[mask] + + elif channel_type == 'in': + new_conv = nn.Conv2d(conv.in_channels - 1, conv.out_channels, conv.kernel_size, conv.stride, + conv.padding, conv.dilation, conv.groups, conv.bias is not None) + mask = torch.ones(conv.in_channels, dtype=torch.uint8) + mask[filter_index] = 0 + new_conv.weight.data = conv.weight.data[:, mask, :, :] + if conv.bias is not None: + new_conv.bias.data = conv.bias.data + else: + raise ValueError(f"{channel_type} should be either 'in' or 'out'.") + return new_conv + + def remove_conv_filter(self, path, filter_index): + conv = self.book.get_module(path) + logging.info(f'Prune Conv: {"/".join(path)}, Filter: {filter_index}, Layer: {conv}') + new_conv = self._make_new_conv(conv, filter_index, channel_type="out") + self._update_model(path, new_conv) + + next_conv_path = self.descendent_convs.get(path) + if next_conv_path: + next_conv = self.book.get_module(next_conv_path) + new_next_conv = self._make_new_conv(next_conv, filter_index, channel_type="in") + self._update_model(next_conv_path, new_next_conv) + + # reduce the num_features of batch norm + batch_norm_path = self.decendent_batch_norms.get(path) + if batch_norm_path: + batch_norm = self.book.get_module(batch_norm_path) + new_batch_norm = nn.BatchNorm2d(batch_norm.num_features - 1) + self._update_model(batch_norm_path, new_batch_norm) + + # reduce the in channels of linear layer + linear_path = self.descendent_linears.get(path) + if linear_path: + linear = self.book.get_module(linear_path) + new_linear = self._make_new_linear(linear, filter_index, conv, channel_type="in") + self._update_model(linear_path, new_linear) + + @staticmethod + def _make_new_linear(linear, feature_index, conv=None, channel_type="out"): + if channel_type == "out": + new_linear = nn.Linear(linear.in_features, linear.out_features - 1, + bias=linear.bias is not None) + mask = torch.ones(linear.out_features, dtype=torch.uint8) + mask[feature_index] = 0 + new_linear.weight.data = linear.weight.data[mask, :] + if linear.bias is not None: + new_linear.bias.data = linear.bias.data[mask] + elif channel_type == "in": + if conv: + block = int(linear.in_features / conv.out_channels) + else: + block = 1 + new_linear = nn.Linear(linear.in_features - block, linear.out_features, + bias=linear.bias is not None) + start_index = feature_index * block + end_index = (feature_index + 1) * block + mask = torch.ones(linear.in_features, dtype=torch.uint8) + mask[start_index: end_index] = 0 + new_linear.weight.data = linear.weight.data[:, mask] + if linear.bias is not None: + new_linear.bias.data = linear.bias.data + else: + raise ValueError(f"{channel_type} should be either 'in' or 'out'.") + return new_linear + + def prune_conv_layers(self, num=1): + """Prune one conv2d filter. 
+ """ + self.register_conv_hooks() + before_loss, before_accuracy = self.train_fun(self.model) + ranks = [] + for path, output in self.outputs.items(): + output = output.data + grad = self.grads[path].data + v = grad * output + v = v.sum(0).sum(1).sum(1) # sum to the channel axis. + v = torch.abs(v) + v = v / torch.sqrt(torch.sum(v * v)) # normalize + for i, e in enumerate(v): + ranks.append((path, i, e)) + to_prune = nsmallest(num, ranks, key=lambda t: t[2]) + to_prune = sorted(to_prune, key=lambda t: (t[0], -t[1])) # prune the filters with bigger indexes first to avoid rearrangement. + for path, filter_index, value in to_prune: + self.remove_conv_filter(path, filter_index) + self.deregister_hooks() + after_loss, after_accuracy = self.train_fun(self.model) + return after_loss - before_loss, after_accuracy - before_accuracy + + def register_conv_hooks(self): + """Run register before training for pruning.""" + self.outputs.clear() + self.grads.clear() + self.handles.clear() + self.last_conv_path = None + self.decendent_batch_norms.clear() + self.descendent_convs.clear() + self.descendent_linears.clear() + + def forward_hook(m, input, output): + path = self.book.get_path(m) + if isinstance(m, nn.Conv2d): + if path not in self.ignored_paths: + self.outputs[path] = output + if self.last_conv_path: + self.descendent_convs[self.last_conv_path] = path + self.last_conv_path = path + elif isinstance(m, nn.BatchNorm2d): + if self.last_conv_path: + self.decendent_batch_norms[self.last_conv_path] = path + elif isinstance(m, nn.Linear): + if self.last_conv_path: + self.descendent_linears[self.last_conv_path] = path + self.last_conv_path = None # after a linear layer the conv layer doesn't matter + + def backward_hook(m, input, output): + path = self.book.get_path(m) + self.grads[path] = output[0] + + for path, m in self.book.modules(module_type=(nn.Conv2d, nn.BatchNorm2d, nn.Linear)): + h = m.register_forward_hook(forward_hook) + self.handles.append(h) + h = m.register_backward_hook(backward_hook) + self.handles.append(h) + + def deregister_hooks(self): + """Run degresiter before retraining to recover the model""" + for handle in self.handles: + handle.remove() + + def prune_linear_layers(self, num=1): + self.register_linear_hooks() + before_loss, before_accuracy = self.train_fun(self.model) + ranks = [] + for path, output in self.outputs.items(): + output = output.data + grad = self.grads[path].data + v = grad * output + v = v.sum(0) # sum to the channel axis. 
+ v = torch.abs(v) + v = v / torch.sqrt(torch.sum(v * v)) # normalize + for i, e in enumerate(v): + ranks.append((path, i, e)) + to_prune = nsmallest(num, ranks, key=lambda t: t[2]) + to_prune = sorted(to_prune, key=lambda t: (t[0], -t[1])) + for path, feature_index, value in to_prune: + self.remove_linear_feature(path, feature_index) + self.deregister_hooks() + after_loss, after_accuracy = self.train_fun(self.model) + return after_loss - before_loss, after_accuracy - before_accuracy + + def register_linear_hooks(self): + self.outputs.clear() + self.grads.clear() + self.handles.clear() + self.descendent_linears.clear() + self.last_linear_path = None + + def forward_hook(m, input, output): + path = self.book.get_path(m) + if path not in self.ignored_paths: + self.outputs[path] = output + if self.last_linear_path: + self.descendent_linears[self.last_linear_path] = path + self.last_linear_path = path + + def backward_hook(m, input, output): + path = self.book.get_path(m) + self.grads[path] = output[0] + + for _, m in self.book.linear_modules(): + h = m.register_forward_hook(forward_hook) + self.handles.append(h) + h = m.register_backward_hook(backward_hook) + self.handles.append(h) + + def remove_linear_feature(self, path, feature_index): + linear = self.book.get_module(path) + logging.info(f'Prune Linear: {"/".join(path)}, Filter: {feature_index}, Layer: {linear}') + new_linear = self._make_new_linear(linear, feature_index, channel_type="out") + self._update_model(path, new_linear) + + # update following linear layers + next_linear_path = self.descendent_linears.get(path) + if next_linear_path: + next_linear = self.book.get_module(next_linear_path) + new_next_linear = self._make_new_linear(next_linear, feature_index, channel_type='in') + self._update_model(next_linear_path, new_next_linear) + + def _update_model(self, path, module): + parent = self.book.get_module(path[:-1]) + parent._modules[path[-1]] = module + self.book.update(path, module) diff --git a/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/ssd/__init__.py b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/ssd/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/ssd/config/__init__.py b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/ssd/config/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/ssd/config/mobilenetv1_ssd_config.py b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/ssd/config/mobilenetv1_ssd_config.py new file mode 100644 index 000000000..73a9e0e08 --- /dev/null +++ b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/ssd/config/mobilenetv1_ssd_config.py @@ -0,0 +1,23 @@ +import numpy as np + +from vision.utils.box_utils import SSDSpec, SSDBoxSizes, generate_ssd_priors + + +image_size = 300 +image_mean = np.array([127, 127, 127]) # RGB layout +image_std = 128.0 +iou_threshold = 0.45 +center_variance = 0.1 +size_variance = 0.2 + +specs = [ + SSDSpec(19, 16, SSDBoxSizes(60, 105), [2, 3]), + SSDSpec(10, 32, SSDBoxSizes(105, 150), [2, 3]), + SSDSpec(5, 64, SSDBoxSizes(150, 195), [2, 3]), + SSDSpec(3, 100, SSDBoxSizes(195, 240), [2, 3]), + SSDSpec(2, 150, SSDBoxSizes(240, 285), [2, 3]), + SSDSpec(1, 300, SSDBoxSizes(285, 330), [2, 3]) +] + + +priors = generate_ssd_priors(specs, image_size) \ No newline at end of file diff --git 
a/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/ssd/config/squeezenet_ssd_config.py b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/ssd/config/squeezenet_ssd_config.py new file mode 100644 index 000000000..111383c63 --- /dev/null +++ b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/ssd/config/squeezenet_ssd_config.py @@ -0,0 +1,23 @@ +import numpy as np + +from vision.utils.box_utils import SSDSpec, SSDBoxSizes, generate_ssd_priors + + +image_size = 300 +image_mean = np.array([127, 127, 127]) # RGB layout +image_std = 128.0 +iou_threshold = 0.45 +center_variance = 0.1 +size_variance = 0.2 + +specs = [ + SSDSpec(17, 16, SSDBoxSizes(60, 105), [2, 3]), + SSDSpec(10, 32, SSDBoxSizes(105, 150), [2, 3]), + SSDSpec(5, 64, SSDBoxSizes(150, 195), [2, 3]), + SSDSpec(3, 100, SSDBoxSizes(195, 240), [2, 3]), + SSDSpec(2, 150, SSDBoxSizes(240, 285), [2, 3]), + SSDSpec(1, 300, SSDBoxSizes(285, 330), [2, 3]) +] + + +priors = generate_ssd_priors(specs, image_size) \ No newline at end of file diff --git a/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/ssd/config/vgg_ssd_config.py b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/ssd/config/vgg_ssd_config.py new file mode 100644 index 000000000..a4d3de6de --- /dev/null +++ b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/ssd/config/vgg_ssd_config.py @@ -0,0 +1,24 @@ +import numpy as np + +from vision.utils.box_utils import SSDSpec, SSDBoxSizes, generate_ssd_priors + + +image_size = 300 +image_mean = np.array([123, 117, 104]) # RGB layout +image_std = 1.0 + +iou_threshold = 0.45 +center_variance = 0.1 +size_variance = 0.2 + +specs = [ + SSDSpec(38, 8, SSDBoxSizes(30, 60), [2]), + SSDSpec(19, 16, SSDBoxSizes(60, 111), [2, 3]), + SSDSpec(10, 32, SSDBoxSizes(111, 162), [2, 3]), + SSDSpec(5, 64, SSDBoxSizes(162, 213), [2, 3]), + SSDSpec(3, 100, SSDBoxSizes(213, 264), [2]), + SSDSpec(1, 300, SSDBoxSizes(264, 315), [2]) +] + + +priors = generate_ssd_priors(specs, image_size) \ No newline at end of file diff --git a/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/ssd/data_preprocessing.py b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/ssd/data_preprocessing.py new file mode 100644 index 000000000..ca79fed87 --- /dev/null +++ b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/ssd/data_preprocessing.py @@ -0,0 +1,62 @@ +from ..transforms.transforms import * + + +class TrainAugmentation: + def __init__(self, size, mean=0, std=1.0): + """ + Args: + size: the size the of final image. + mean: mean pixel value per channel. + """ + self.mean = mean + self.size = size + self.augment = Compose([ + ConvertFromInts(), + PhotometricDistort(), + Expand(self.mean), + RandomSampleCrop(), + RandomMirror(), + ToPercentCoords(), + Resize(self.size), + SubtractMeans(self.mean), + lambda img, boxes=None, labels=None: (img / std, boxes, labels), + ToTensor(), + ]) + + def __call__(self, img, boxes, labels): + """ + + Args: + img: the output of cv.imread in RGB layout. + boxes: boundding boxes in the form of (x1, y1, x2, y2). + labels: labels of boxes. 
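+ Returns:
+ the augmented (img, boxes, labels) tuple produced by the composed transforms.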
+ """ + return self.augment(img, boxes, labels) + + +class TestTransform: + def __init__(self, size, mean=0.0, std=1.0): + self.transform = Compose([ + ToPercentCoords(), + Resize(size), + SubtractMeans(mean), + lambda img, boxes=None, labels=None: (img / std, boxes, labels), + ToTensor(), + ]) + + def __call__(self, image, boxes, labels): + return self.transform(image, boxes, labels) + + +class PredictionTransform: + def __init__(self, size, mean=0.0, std=1.0): + self.transform = Compose([ + Resize(size), + SubtractMeans(mean), + lambda img, boxes=None, labels=None: (img / std, boxes, labels), + ToTensor() + ]) + + def __call__(self, image): + image, _, _ = self.transform(image) + return image \ No newline at end of file diff --git a/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/ssd/fpn_mobilenetv1_ssd.py b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/ssd/fpn_mobilenetv1_ssd.py new file mode 100644 index 000000000..af2d2166a --- /dev/null +++ b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/ssd/fpn_mobilenetv1_ssd.py @@ -0,0 +1,74 @@ +import torch +from torch.nn import Conv2d, Sequential, ModuleList, ReLU +from ..nn.mobilenet import MobileNetV1 + +from .fpn_ssd import FPNSSD +from .predictor import Predictor +from .config import mobilenetv1_ssd_config as config + + +def create_fpn_mobilenetv1_ssd(num_classes): + base_net = MobileNetV1(1001).features # disable dropout layer + + source_layer_indexes = [ + (69, Conv2d(in_channels=512, out_channels=256, kernel_size=1)), + (len(base_net), Conv2d(in_channels=1024, out_channels=256, kernel_size=1)), + ] + extras = ModuleList([ + Sequential( + Conv2d(in_channels=1024, out_channels=256, kernel_size=1), + ReLU(), + Conv2d(in_channels=256, out_channels=256, kernel_size=3, stride=2, padding=1), + ReLU() + ), + Sequential( + Conv2d(in_channels=256, out_channels=128, kernel_size=1), + ReLU(), + Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1), + ReLU() + ), + Sequential( + Conv2d(in_channels=256, out_channels=128, kernel_size=1), + ReLU(), + Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1), + ReLU() + ), + Sequential( + Conv2d(in_channels=256, out_channels=128, kernel_size=1), + ReLU(), + Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1), + ReLU() + ) + ]) + + regression_headers = ModuleList([ + Conv2d(in_channels=256, out_channels=6 * 4, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=6 * 4, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=6 * 4, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=6 * 4, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=6 * 4, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=6 * 4, kernel_size=3, padding=1), # TODO: change to kernel_size=1, padding=0? + ]) + + classification_headers = ModuleList([ + Conv2d(in_channels=256, out_channels=6 * num_classes, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=6 * num_classes, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=6 * num_classes, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=6 * num_classes, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=6 * num_classes, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=6 * num_classes, kernel_size=3, padding=1), # TODO: change to kernel_size=1, padding=0? 
+ ]) + + return FPNSSD(num_classes, base_net, source_layer_indexes, + extras, classification_headers, regression_headers) + + +def create_fpn_mobilenetv1_ssd_predictor(net, candidate_size=200, nms_method=None, sigma=0.5, device=torch.device('cpu')): + predictor = Predictor(net, config.image_size, config.image_mean, config.priors, + config.center_variance, config.size_variance, + nms_method=nms_method, + iou_threshold=config.iou_threshold, + candidate_size=candidate_size, + sigma=sigma, + device=device) + return predictor diff --git a/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/ssd/fpn_ssd.py b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/ssd/fpn_ssd.py new file mode 100644 index 000000000..876d24d3f --- /dev/null +++ b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/ssd/fpn_ssd.py @@ -0,0 +1,142 @@ +import torch.nn as nn +import torch +import torch.nn.functional as F +import numpy as np +from typing import List, Tuple + +from ..utils import box_utils + + +class FPNSSD(nn.Module): + def __init__(self, num_classes: int, base_net: nn.ModuleList, source_layer_indexes: List[int], + extras: nn.ModuleList, classification_headers: nn.ModuleList, + regression_headers: nn.ModuleList, upsample_mode="nearest"): + """Compose a SSD model using the given components. + """ + super(FPNSSD, self).__init__() + + self.num_classes = num_classes + self.base_net = base_net + self.source_layer_indexes = source_layer_indexes + self.extras = extras + self.classification_headers = classification_headers + self.regression_headers = regression_headers + self.upsample_mode = upsample_mode + + # register layers in source_layer_indexes by adding them to a module list + self.source_layer_add_ons = nn.ModuleList([t[1] for t in source_layer_indexes if isinstance(t, tuple)]) + self.upsamplers = [ + nn.Upsample(size=(19, 19), mode='bilinear'), + nn.Upsample(size=(10, 10), mode='bilinear'), + nn.Upsample(size=(5, 5), mode='bilinear'), + nn.Upsample(size=(3, 3), mode='bilinear'), + nn.Upsample(size=(2, 2), mode='bilinear'), + ] + + def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + confidences = [] + locations = [] + start_layer_index = 0 + header_index = 0 + features = [] + for end_layer_index in self.source_layer_indexes: + + if isinstance(end_layer_index, tuple): + added_layer = end_layer_index[1] + end_layer_index = end_layer_index[0] + else: + added_layer = None + for layer in self.base_net[start_layer_index: end_layer_index]: + x = layer(x) + start_layer_index = end_layer_index + if added_layer: + y = added_layer(x) + else: + y = x + #confidence, location = self.compute_header(header_index, y) + features.append(y) + header_index += 1 + # confidences.append(confidence) + # locations.append(location) + + for layer in self.base_net[end_layer_index:]: + x = layer(x) + + for layer in self.extras: + x = layer(x) + #confidence, location = self.compute_header(header_index, x) + features.append(x) + header_index += 1 + # confidences.append(confidence) + # locations.append(location) + + upstream_feature = None + for i in range(len(features) - 1, -1, -1): + feature = features[i] + if upstream_feature is not None: + upstream_feature = self.upsamplers[i](upstream_feature) + upstream_feature += feature + else: + upstream_feature = feature + confidence, location = self.compute_header(i, upstream_feature) + confidences.append(confidence) + locations.append(location) + confidences = torch.cat(confidences, 1) + locations = torch.cat(locations, 1) + 
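+ # confidences: (batch, num_priors, num_classes); locations: (batch, num_priors, 4)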
return confidences, locations + + def compute_header(self, i, x): + confidence = self.classification_headers[i](x) + confidence = confidence.permute(0, 2, 3, 1).contiguous() + confidence = confidence.view(confidence.size(0), -1, self.num_classes) + + location = self.regression_headers[i](x) + location = location.permute(0, 2, 3, 1).contiguous() + location = location.view(location.size(0), -1, 4) + + return confidence, location + + def init_from_base_net(self, model): + self.base_net.load_state_dict(torch.load(model, map_location=lambda storage, loc: storage), strict=False) + self.source_layer_add_ons.apply(_xavier_init_) + self.extras.apply(_xavier_init_) + self.classification_headers.apply(_xavier_init_) + self.regression_headers.apply(_xavier_init_) + + def init(self): + self.base_net.apply(_xavier_init_) + self.source_layer_add_ons.apply(_xavier_init_) + self.extras.apply(_xavier_init_) + self.classification_headers.apply(_xavier_init_) + self.regression_headers.apply(_xavier_init_) + + def load(self, model): + self.load_state_dict(torch.load(model, map_location=lambda storage, loc: storage)) + + def save(self, model_path): + torch.save(self.state_dict(), model_path) + + +class MatchPrior(object): + def __init__(self, center_form_priors, center_variance, size_variance, iou_threshold): + self.center_form_priors = center_form_priors + self.corner_form_priors = box_utils.center_form_to_corner_form(center_form_priors) + self.center_variance = center_variance + self.size_variance = size_variance + self.iou_threshold = iou_threshold + + def __call__(self, gt_boxes, gt_labels): + if type(gt_boxes) is np.ndarray: + gt_boxes = torch.from_numpy(gt_boxes) + if type(gt_labels) is np.ndarray: + gt_labels = torch.from_numpy(gt_labels) + boxes, labels = box_utils.assign_priors(gt_boxes, gt_labels, + self.corner_form_priors, self.iou_threshold) + boxes = box_utils.corner_form_to_center_form(boxes) + locations = box_utils.convert_boxes_to_locations(boxes, self.center_form_priors, self.center_variance, self.size_variance) + return locations, labels + + +def _xavier_init_(m: nn.Module): + if isinstance(m, nn.Conv2d): + nn.init.xavier_uniform_(m.weight) diff --git a/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/ssd/mobilenet_v2_ssd_lite.py b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/ssd/mobilenet_v2_ssd_lite.py new file mode 100644 index 000000000..6eab22ac5 --- /dev/null +++ b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/ssd/mobilenet_v2_ssd_lite.py @@ -0,0 +1,70 @@ +import torch +from torch.nn import Conv2d, Sequential, ModuleList, BatchNorm2d +from torch import nn +from ..nn.mobilenet_v2 import MobileNetV2, InvertedResidual + +from .ssd import SSD, GraphPath +from .predictor import Predictor +from .config import mobilenetv1_ssd_config as config + + +def SeperableConv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0, onnx_compatible=False): + """Replace Conv2d with a depthwise Conv2d and Pointwise Conv2d. 
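+ The block is: depthwise Conv2d (groups=in_channels), BatchNorm2d, ReLU6 (plain ReLU
+ when onnx_compatible=True), then a 1x1 pointwise Conv2d.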
+ """ + ReLU = nn.ReLU if onnx_compatible else nn.ReLU6 + return Sequential( + Conv2d(in_channels=in_channels, out_channels=in_channels, kernel_size=kernel_size, + groups=in_channels, stride=stride, padding=padding), + BatchNorm2d(in_channels), + ReLU(), + Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=1), + ) + + +def create_mobilenetv2_ssd_lite(num_classes, width_mult=1.0, use_batch_norm=True, onnx_compatible=False, is_test=False): + base_net = MobileNetV2(width_mult=width_mult, use_batch_norm=use_batch_norm, + onnx_compatible=onnx_compatible).features + + source_layer_indexes = [ + GraphPath(14, 'conv', 3), + 19, + ] + extras = ModuleList([ + InvertedResidual(1280, 512, stride=2, expand_ratio=0.2), + InvertedResidual(512, 256, stride=2, expand_ratio=0.25), + InvertedResidual(256, 256, stride=2, expand_ratio=0.5), + InvertedResidual(256, 64, stride=2, expand_ratio=0.25) + ]) + + regression_headers = ModuleList([ + SeperableConv2d(in_channels=round(576 * width_mult), out_channels=6 * 4, + kernel_size=3, padding=1, onnx_compatible=False), + SeperableConv2d(in_channels=1280, out_channels=6 * 4, kernel_size=3, padding=1, onnx_compatible=False), + SeperableConv2d(in_channels=512, out_channels=6 * 4, kernel_size=3, padding=1, onnx_compatible=False), + SeperableConv2d(in_channels=256, out_channels=6 * 4, kernel_size=3, padding=1, onnx_compatible=False), + SeperableConv2d(in_channels=256, out_channels=6 * 4, kernel_size=3, padding=1, onnx_compatible=False), + Conv2d(in_channels=64, out_channels=6 * 4, kernel_size=1), + ]) + + classification_headers = ModuleList([ + SeperableConv2d(in_channels=round(576 * width_mult), out_channels=6 * num_classes, kernel_size=3, padding=1), + SeperableConv2d(in_channels=1280, out_channels=6 * num_classes, kernel_size=3, padding=1), + SeperableConv2d(in_channels=512, out_channels=6 * num_classes, kernel_size=3, padding=1), + SeperableConv2d(in_channels=256, out_channels=6 * num_classes, kernel_size=3, padding=1), + SeperableConv2d(in_channels=256, out_channels=6 * num_classes, kernel_size=3, padding=1), + Conv2d(in_channels=64, out_channels=6 * num_classes, kernel_size=1), + ]) + + return SSD(num_classes, base_net, source_layer_indexes, + extras, classification_headers, regression_headers, is_test=is_test, config=config) + + +def create_mobilenetv2_ssd_lite_predictor(net, candidate_size=200, nms_method=None, sigma=0.5, device=torch.device('cpu')): + predictor = Predictor(net, config.image_size, config.image_mean, + config.image_std, + nms_method=nms_method, + iou_threshold=config.iou_threshold, + candidate_size=candidate_size, + sigma=sigma, + device=device) + return predictor diff --git a/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/ssd/mobilenetv1_ssd.py b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/ssd/mobilenetv1_ssd.py new file mode 100644 index 000000000..bc739346c --- /dev/null +++ b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/ssd/mobilenetv1_ssd.py @@ -0,0 +1,74 @@ +import torch +from torch.nn import Conv2d, Sequential, ModuleList, ReLU +from ..nn.mobilenet import MobileNetV1 + +from .ssd import SSD +from .predictor import Predictor +from .config import mobilenetv1_ssd_config as config + + +def create_mobilenetv1_ssd(num_classes, is_test=False, device=None): + base_net = MobileNetV1(1001).model # disable dropout layer + + source_layer_indexes = [ + 12, + 14, + ] + extras = ModuleList([ + Sequential( + Conv2d(in_channels=1024, out_channels=256, kernel_size=1), + 
ReLU(), + Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=2, padding=1), + ReLU() + ), + Sequential( + Conv2d(in_channels=512, out_channels=128, kernel_size=1), + ReLU(), + Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1), + ReLU() + ), + Sequential( + Conv2d(in_channels=256, out_channels=128, kernel_size=1), + ReLU(), + Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1), + ReLU() + ), + Sequential( + Conv2d(in_channels=256, out_channels=128, kernel_size=1), + ReLU(), + Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1), + ReLU() + ) + ]) + + regression_headers = ModuleList([ + Conv2d(in_channels=512, out_channels=6 * 4, kernel_size=3, padding=1), + Conv2d(in_channels=1024, out_channels=6 * 4, kernel_size=3, padding=1), + Conv2d(in_channels=512, out_channels=6 * 4, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=6 * 4, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=6 * 4, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=6 * 4, kernel_size=3, padding=1), # TODO: change to kernel_size=1, padding=0? + ]) + + classification_headers = ModuleList([ + Conv2d(in_channels=512, out_channels=6 * num_classes, kernel_size=3, padding=1), + Conv2d(in_channels=1024, out_channels=6 * num_classes, kernel_size=3, padding=1), + Conv2d(in_channels=512, out_channels=6 * num_classes, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=6 * num_classes, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=6 * num_classes, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=6 * num_classes, kernel_size=3, padding=1), # TODO: change to kernel_size=1, padding=0? + ]) + + return SSD(num_classes, base_net, source_layer_indexes, + extras, classification_headers, regression_headers, is_test=is_test, config=config, device=device) + + +def create_mobilenetv1_ssd_predictor(net, candidate_size=200, nms_method=None, sigma=0.5, device=None): + predictor = Predictor(net, config.image_size, config.image_mean, + config.image_std, + nms_method=nms_method, + iou_threshold=config.iou_threshold, + candidate_size=candidate_size, + sigma=sigma, + device=device) + return predictor diff --git a/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/ssd/mobilenetv1_ssd_lite.py b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/ssd/mobilenetv1_ssd_lite.py new file mode 100644 index 000000000..a8492de0e --- /dev/null +++ b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/ssd/mobilenetv1_ssd_lite.py @@ -0,0 +1,81 @@ +import torch +from torch.nn import Conv2d, Sequential, ModuleList, ReLU, BatchNorm2d +from ..nn.mobilenet import MobileNetV1 + +from .ssd import SSD +from .predictor import Predictor +from .config import mobilenetv1_ssd_config as config + + +def SeperableConv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0): + """Replace Conv2d with a depthwise Conv2d and Pointwise Conv2d. 
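+ Unlike the MobileNetV2-SSD-Lite variant, this block uses a plain ReLU and no BatchNorm
+ between the depthwise and pointwise convolutions.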
+ """ + return Sequential( + Conv2d(in_channels=in_channels, out_channels=in_channels, kernel_size=kernel_size, + groups=in_channels, stride=stride, padding=padding), + ReLU(), + Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=1), + ) + + +def create_mobilenetv1_ssd_lite(num_classes, is_test=False): + base_net = MobileNetV1(1001).model # disable dropout layer + + source_layer_indexes = [ + 12, + 14, + ] + extras = ModuleList([ + Sequential( + Conv2d(in_channels=1024, out_channels=256, kernel_size=1), + ReLU(), + SeperableConv2d(in_channels=256, out_channels=512, kernel_size=3, stride=2, padding=1), + ), + Sequential( + Conv2d(in_channels=512, out_channels=128, kernel_size=1), + ReLU(), + SeperableConv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1), + ), + Sequential( + Conv2d(in_channels=256, out_channels=128, kernel_size=1), + ReLU(), + SeperableConv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1), + ), + Sequential( + Conv2d(in_channels=256, out_channels=128, kernel_size=1), + ReLU(), + SeperableConv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1) + ) + ]) + + regression_headers = ModuleList([ + SeperableConv2d(in_channels=512, out_channels=6 * 4, kernel_size=3, padding=1), + SeperableConv2d(in_channels=1024, out_channels=6 * 4, kernel_size=3, padding=1), + SeperableConv2d(in_channels=512, out_channels=6 * 4, kernel_size=3, padding=1), + SeperableConv2d(in_channels=256, out_channels=6 * 4, kernel_size=3, padding=1), + SeperableConv2d(in_channels=256, out_channels=6 * 4, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=6 * 4, kernel_size=1), + ]) + + classification_headers = ModuleList([ + SeperableConv2d(in_channels=512, out_channels=6 * num_classes, kernel_size=3, padding=1), + SeperableConv2d(in_channels=1024, out_channels=6 * num_classes, kernel_size=3, padding=1), + SeperableConv2d(in_channels=512, out_channels=6 * num_classes, kernel_size=3, padding=1), + SeperableConv2d(in_channels=256, out_channels=6 * num_classes, kernel_size=3, padding=1), + SeperableConv2d(in_channels=256, out_channels=6 * num_classes, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=6 * num_classes, kernel_size=1), + ]) + + return SSD(num_classes, base_net, source_layer_indexes, + extras, classification_headers, regression_headers, is_test=is_test, config=config) + + +def create_mobilenetv1_ssd_lite_predictor(net, candidate_size=200, nms_method=None, sigma=0.5, device=None): + predictor = Predictor(net, config.image_size, config.image_mean, + config.image_std, + nms_method=nms_method, + iou_threshold=config.iou_threshold, + candidate_size=candidate_size, + sigma=sigma, + device=device) + return predictor diff --git a/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/ssd/predictor.py b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/ssd/predictor.py new file mode 100644 index 000000000..0f741071e --- /dev/null +++ b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/ssd/predictor.py @@ -0,0 +1,130 @@ +import os +import torch +import time + +from ..utils import box_utils +from .data_preprocessing import PredictionTransform +from ..utils.misc import Timer + + +class Predictor: + def __init__(self, net, size, mean=0.0, std=1.0, nms_method=None, + iou_threshold=0.45, filter_threshold=0.01, candidate_size=200, sigma=0.5, device=None): + self.net = net + self.transform = PredictionTransform(size, mean, std) + self.iou_threshold = 
iou_threshold + self.filter_threshold = filter_threshold + self.candidate_size = candidate_size + self.nms_method = nms_method + + self.sigma = sigma + if device: + self.device = device + else: + self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + + self.net.to(self.device) + self.net.eval() + + self.timer = Timer() + + def predict(self, flag, channels_flag, image, top_k=-1, prob_threshold=None, profiling=False, args=None, iter=0): + cpu_device = torch.device("cpu") + if args is not None and args.dummy > 0: + height = args.image_size + width = args.image_size + images = torch.randn(args.batch_size, 3, height, height) + else: + height, width, _ = image.shape + image = self.transform(image) + images = image.unsqueeze(0) + if flag == 1: + images = images.to(torch.half) + elif flag == 2: + images = images.to(torch.bfloat16) + if channels_flag == 1: + images = images.to(memory_format=torch.channels_last) + if args is not None and args.benchmark == 1: + images = images.to(self.device) + with torch.autograd.profiler_legacy.profile(enabled=profiling, use_xpu=True, record_shapes=False) as prof: + with torch.inference_mode(): + self.timer.start() + if args is None or args.benchmark == 0: + images = images.to(self.device) + scores, boxes = self.net.forward(images) + + if flag > 0: + boxes = boxes[0].to(torch.float) + scores = scores[0].to(torch.float) + else: + boxes = boxes[0] + scores = scores[0] + if not prob_threshold: + prob_threshold = self.filter_threshold + + # sync for time measurement + torch.xpu.synchronize() + if args is not None and args.benchmark == 1: + print("Inference time: ", self.timer.end()) + + # this version of nms is slower on GPU, so we move data to CPU. + boxes = boxes.to(cpu_device) + scores = scores.to(cpu_device) + + post_start_time = time.time() + picked_box_probs = [] + picked_labels = [] + for class_index in range(1, scores.size(1)): + probs = scores[:, class_index] + mask = probs > prob_threshold + probs = probs[mask] + if probs.size(0) == 0: + continue + subset_boxes = boxes[mask, :] + box_probs = torch.cat([subset_boxes, probs.reshape(-1, 1)], dim=1) + box_probs = box_utils.nms(box_probs, self.nms_method, + score_threshold=prob_threshold, + iou_threshold=self.iou_threshold, + sigma=self.sigma, + top_k=top_k, + candidate_size=self.candidate_size) + picked_box_probs.append(box_probs) + picked_labels.extend([class_index] * box_probs.size(0)) + if not picked_box_probs: + return torch.tensor([]), torch.tensor([]), torch.tensor([]) + picked_box_probs = torch.cat(picked_box_probs) + picked_box_probs[:, 0] *= width + picked_box_probs[:, 1] *= height + picked_box_probs[:, 2] *= width + picked_box_probs[:, 3] *= height + + ret = picked_box_probs[:, :4], torch.tensor(picked_labels), picked_box_probs[:, 4] + post_end_time = time.time() + print("Post time: ", post_end_time - post_start_time) + if args is None or args.benchmark == 0: + print("Inference time: ", self.timer.end()) + + if profiling and iter==args.profile_iter: + title = "/ssd_mobilenetv1_inference_" + if args.channels_last: + title += "channels_last_" + else: + title += "block_" + if args.bf16: + title += "bf16_" + if args.fp16: + title += "fp16_" + if args.int8: + title += "int8_" + if args.batch_size: + title += "bs" + str(args.batch_size) + "_" + + profiling_path = os.getenv('PROFILE_PATH') + if not profiling_path: + profiling_path = './' + torch.save(prof.key_averages().table(sort_by="self_xpu_time_total"), profiling_path + title + 'profiling.pt') + 
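+ # also dump a per-input-shape breakdown and a Chrome trace for timeline inspection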
torch.save(prof.key_averages(group_by_input_shape=True).table(), profiling_path + title + 'profiling_detailed.pt') + prof.export_chrome_trace(profiling_path + title + 'profiling.json') + print(prof.key_averages().table(sort_by="self_xpu_time_total")) + print(prof.key_averages(group_by_input_shape=True).table()) + return ret diff --git a/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/ssd/squeezenet_ssd_lite.py b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/ssd/squeezenet_ssd_lite.py new file mode 100644 index 000000000..947ca0231 --- /dev/null +++ b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/ssd/squeezenet_ssd_lite.py @@ -0,0 +1,85 @@ +import torch +from torch.nn import Conv2d, Sequential, ModuleList, ReLU +from ..nn.squeezenet import squeezenet1_1 + +from .ssd import SSD +from .predictor import Predictor +from .config import squeezenet_ssd_config as config + + +def SeperableConv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0): + """Replace Conv2d with a depthwise Conv2d and Pointwise Conv2d. + """ + return Sequential( + Conv2d(in_channels=in_channels, out_channels=in_channels, kernel_size=kernel_size, + groups=in_channels, stride=stride, padding=padding), + ReLU(), + Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=1), + ) + + +def create_squeezenet_ssd_lite(num_classes, is_test=False): + base_net = squeezenet1_1(False).features # disable dropout layer + + source_layer_indexes = [ + 12 + ] + extras = ModuleList([ + Sequential( + Conv2d(in_channels=512, out_channels=256, kernel_size=1), + ReLU(), + SeperableConv2d(in_channels=256, out_channels=512, kernel_size=3, stride=2, padding=2), + ), + Sequential( + Conv2d(in_channels=512, out_channels=256, kernel_size=1), + ReLU(), + SeperableConv2d(in_channels=256, out_channels=512, kernel_size=3, stride=2, padding=1), + ), + Sequential( + Conv2d(in_channels=512, out_channels=128, kernel_size=1), + ReLU(), + SeperableConv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1), + ), + Sequential( + Conv2d(in_channels=256, out_channels=128, kernel_size=1), + ReLU(), + SeperableConv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1), + ), + Sequential( + Conv2d(in_channels=256, out_channels=128, kernel_size=1), + ReLU(), + SeperableConv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1) + ) + ]) + + regression_headers = ModuleList([ + SeperableConv2d(in_channels=512, out_channels=6 * 4, kernel_size=3, padding=1), + SeperableConv2d(in_channels=512, out_channels=6 * 4, kernel_size=3, padding=1), + SeperableConv2d(in_channels=512, out_channels=6 * 4, kernel_size=3, padding=1), + SeperableConv2d(in_channels=256, out_channels=6 * 4, kernel_size=3, padding=1), + SeperableConv2d(in_channels=256, out_channels=6 * 4, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=6 * 4, kernel_size=1), + ]) + + classification_headers = ModuleList([ + SeperableConv2d(in_channels=512, out_channels=6 * num_classes, kernel_size=3, padding=1), + SeperableConv2d(in_channels=512, out_channels=6 * num_classes, kernel_size=3, padding=1), + SeperableConv2d(in_channels=512, out_channels=6 * num_classes, kernel_size=3, padding=1), + SeperableConv2d(in_channels=256, out_channels=6 * num_classes, kernel_size=3, padding=1), + SeperableConv2d(in_channels=256, out_channels=6 * num_classes, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=6 * num_classes, kernel_size=1), + ]) + + return 
SSD(num_classes, base_net, source_layer_indexes, + extras, classification_headers, regression_headers, is_test=is_test, config=config) + + +def create_squeezenet_ssd_lite_predictor(net, candidate_size=200, nms_method=None, sigma=0.5, device=torch.device('cpu')): + predictor = Predictor(net, config.image_size, config.image_mean, + config.image_std, + nms_method=nms_method, + iou_threshold=config.iou_threshold, + candidate_size=candidate_size, + sigma=sigma, + device=device) + return predictor \ No newline at end of file diff --git a/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/ssd/ssd.py b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/ssd/ssd.py new file mode 100644 index 000000000..565cf591a --- /dev/null +++ b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/ssd/ssd.py @@ -0,0 +1,162 @@ +import torch.nn as nn +import torch +import numpy as np +from typing import List, Tuple +import torch.nn.functional as F + +from ..utils import box_utils +from collections import namedtuple +GraphPath = namedtuple("GraphPath", ['s0', 'name', 's1']) # + + +class SSD(nn.Module): + def __init__(self, num_classes: int, base_net: nn.ModuleList, source_layer_indexes: List[int], + extras: nn.ModuleList, classification_headers: nn.ModuleList, + regression_headers: nn.ModuleList, is_test=False, config=None, device=None): + """Compose a SSD model using the given components. + """ + super(SSD, self).__init__() + + self.num_classes = num_classes + self.base_net = base_net + self.source_layer_indexes = source_layer_indexes + self.extras = extras + self.classification_headers = classification_headers + self.regression_headers = regression_headers + self.is_test = is_test + self.config = config + + # register layers in source_layer_indexes by adding them to a module list + self.source_layer_add_ons = nn.ModuleList([t[1] for t in source_layer_indexes + if isinstance(t, tuple) and not isinstance(t, GraphPath)]) + if device: + self.device = device + else: + self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") + if is_test: + self.config = config + self.priors = config.priors.to(self.device) + + def forward(self, x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + confidences = [] + locations = [] + start_layer_index = 0 + header_index = 0 + for end_layer_index in self.source_layer_indexes: + if isinstance(end_layer_index, GraphPath): + path = end_layer_index + end_layer_index = end_layer_index.s0 + added_layer = None + elif isinstance(end_layer_index, tuple): + added_layer = end_layer_index[1] + end_layer_index = end_layer_index[0] + path = None + else: + added_layer = None + path = None + for layer in self.base_net[start_layer_index: end_layer_index]: + x = layer(x) + if added_layer: + y = added_layer(x) + else: + y = x + if path: + sub = getattr(self.base_net[end_layer_index], path.name) + for layer in sub[:path.s1]: + x = layer(x) + y = x + for layer in sub[path.s1:]: + x = layer(x) + end_layer_index += 1 + start_layer_index = end_layer_index + confidence, location = self.compute_header(header_index, y) + header_index += 1 + confidences.append(confidence) + locations.append(location) + + for layer in self.base_net[end_layer_index:]: + x = layer(x) + + for layer in self.extras: + x = layer(x) + confidence, location = self.compute_header(header_index, x) + header_index += 1 + confidences.append(confidence) + locations.append(location) + + confidences = torch.cat(confidences, 1) + # (Todo zhiyuan) Convert to float due to Float16 += 
BFloat16 not supported + # by TensorIterator yet, while Float32 += Half is OK + locations = torch.cat(locations, 1).to(torch.float) + + if self.is_test: + confidences = F.softmax(confidences, dim=2) + boxes = torch.xpu.locations_to_boxes(locations, self.priors, self.config.center_variance, self.config.size_variance); + return confidences, boxes + else: + return confidences, locations + + def compute_header(self, i, x): + confidence = self.classification_headers[i](x) + confidence = confidence.permute(0, 2, 3, 1).contiguous() + confidence = confidence.view(confidence.size(0), -1, self.num_classes) + + location = self.regression_headers[i](x) + location = location.permute(0, 2, 3, 1).contiguous() + location = location.view(location.size(0), -1, 4) + + return confidence, location + + def init_from_base_net(self, model): + self.base_net.load_state_dict(torch.load(model, map_location=lambda storage, loc: storage), strict=True) + self.source_layer_add_ons.apply(_xavier_init_) + self.extras.apply(_xavier_init_) + self.classification_headers.apply(_xavier_init_) + self.regression_headers.apply(_xavier_init_) + + def init_from_pretrained_ssd(self, model): + state_dict = torch.load(model, map_location=lambda storage, loc: storage) + state_dict = {k: v for k, v in state_dict.items() if not (k.startswith("classification_headers") or k.startswith("regression_headers"))} + model_dict = self.state_dict() + model_dict.update(state_dict) + self.load_state_dict(model_dict) + self.classification_headers.apply(_xavier_init_) + self.regression_headers.apply(_xavier_init_) + + def init(self): + self.base_net.apply(_xavier_init_) + self.source_layer_add_ons.apply(_xavier_init_) + self.extras.apply(_xavier_init_) + self.classification_headers.apply(_xavier_init_) + self.regression_headers.apply(_xavier_init_) + + def load(self, model): + self.load_state_dict(torch.load(model, map_location=lambda storage, loc: storage)) + + def save(self, model_path): + torch.save(self.state_dict(), model_path) + + +class MatchPrior(object): + def __init__(self, center_form_priors, center_variance, size_variance, iou_threshold): + self.center_form_priors = center_form_priors + self.corner_form_priors = box_utils.center_form_to_corner_form(center_form_priors) + self.center_variance = center_variance + self.size_variance = size_variance + self.iou_threshold = iou_threshold + + def __call__(self, gt_boxes, gt_labels): + if type(gt_boxes) is np.ndarray: + gt_boxes = torch.from_numpy(gt_boxes) + if type(gt_labels) is np.ndarray: + gt_labels = torch.from_numpy(gt_labels) + boxes, labels = box_utils.assign_priors(gt_boxes, gt_labels, + self.corner_form_priors, self.iou_threshold) + boxes = box_utils.corner_form_to_center_form(boxes) + locations = box_utils.convert_boxes_to_locations(boxes, self.center_form_priors, self.center_variance, self.size_variance) + return locations, labels + + +def _xavier_init_(m: nn.Module): + if isinstance(m, nn.Conv2d): + nn.init.xavier_uniform_(m.weight) diff --git a/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/ssd/vgg_ssd.py b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/ssd/vgg_ssd.py new file mode 100644 index 000000000..051d45cd0 --- /dev/null +++ b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/ssd/vgg_ssd.py @@ -0,0 +1,75 @@ +import torch +from torch.nn import Conv2d, Sequential, ModuleList, ReLU, BatchNorm2d +from ..nn.vgg import vgg + +from .ssd import SSD +from .predictor import Predictor +from .config import vgg_ssd_config as config + 
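+# With the 4/6/6/6/4/4 heads below and vgg_ssd_config.specs, this SSD300 variant emits
+# 8732 priors per 300x300 image (matching the assertion in test_vgg_ssd.py).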
+ +def create_vgg_ssd(num_classes, is_test=False): + vgg_config = [64, 64, 'M', 128, 128, 'M', 256, 256, 256, 'C', 512, 512, 512, 'M', + 512, 512, 512] + base_net = ModuleList(vgg(vgg_config)) + + source_layer_indexes = [ + (23, BatchNorm2d(512)), + len(base_net), + ] + extras = ModuleList([ + Sequential( + Conv2d(in_channels=1024, out_channels=256, kernel_size=1), + ReLU(), + Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=2, padding=1), + ReLU() + ), + Sequential( + Conv2d(in_channels=512, out_channels=128, kernel_size=1), + ReLU(), + Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2, padding=1), + ReLU() + ), + Sequential( + Conv2d(in_channels=256, out_channels=128, kernel_size=1), + ReLU(), + Conv2d(in_channels=128, out_channels=256, kernel_size=3), + ReLU() + ), + Sequential( + Conv2d(in_channels=256, out_channels=128, kernel_size=1), + ReLU(), + Conv2d(in_channels=128, out_channels=256, kernel_size=3), + ReLU() + ) + ]) + + regression_headers = ModuleList([ + Conv2d(in_channels=512, out_channels=4 * 4, kernel_size=3, padding=1), + Conv2d(in_channels=1024, out_channels=6 * 4, kernel_size=3, padding=1), + Conv2d(in_channels=512, out_channels=6 * 4, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=6 * 4, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=4 * 4, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=4 * 4, kernel_size=3, padding=1), # TODO: change to kernel_size=1, padding=0? + ]) + + classification_headers = ModuleList([ + Conv2d(in_channels=512, out_channels=4 * num_classes, kernel_size=3, padding=1), + Conv2d(in_channels=1024, out_channels=6 * num_classes, kernel_size=3, padding=1), + Conv2d(in_channels=512, out_channels=6 * num_classes, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=6 * num_classes, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=4 * num_classes, kernel_size=3, padding=1), + Conv2d(in_channels=256, out_channels=4 * num_classes, kernel_size=3, padding=1), # TODO: change to kernel_size=1, padding=0? 
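+ # 4-6-6-6-4-4 anchors per location, mirroring the aspect ratios in vgg_ssd_config.specs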
+ ]) + + return SSD(num_classes, base_net, source_layer_indexes, + extras, classification_headers, regression_headers, is_test=is_test, config=config) + + +def create_vgg_ssd_predictor(net, candidate_size=200, nms_method=None, sigma=0.5, device=None): + predictor = Predictor(net, config.image_size, config.image_mean, + nms_method=nms_method, + iou_threshold=config.iou_threshold, + candidate_size=candidate_size, + sigma=sigma, + device=device) + return predictor diff --git a/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/test/__init__.py b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/test/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/test/assets/000138.jpg b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/test/assets/000138.jpg new file mode 100644 index 000000000..6e4746ef0 Binary files /dev/null and b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/test/assets/000138.jpg differ diff --git a/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/test/test_vgg_ssd.py b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/test/test_vgg_ssd.py new file mode 100644 index 000000000..4dae62bbe --- /dev/null +++ b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/test/test_vgg_ssd.py @@ -0,0 +1,48 @@ +from ..ssd.vgg_ssd import create_vgg_ssd + +import torch +import tempfile + + +def test_create_vgg_ssd(): + for num_classes in [2, 10, 21, 100]: + _ = create_vgg_ssd(num_classes) + + +def test_forward(): + for num_classes in [2]: + net = create_vgg_ssd(num_classes) + net.init() + net.eval() + x = torch.randn(2, 3, 300, 300) + confidences, locations = net.forward(x) + assert confidences.size() == torch.Size([2, 8732, num_classes]) + assert locations.size() == torch.Size([2, 8732, 4]) + assert confidences.nonzero().size(0) != 0 + assert locations.nonzero().size(0) != 0 + + +def test_save_model(): + net = create_vgg_ssd(10) + net.init() + with tempfile.TemporaryFile() as f: + net.save(f) + + +def test_save_load_model_consistency(): + net = create_vgg_ssd(20) + net.init() + model_path = tempfile.NamedTemporaryFile().name + net.save(model_path) + net_copy = create_vgg_ssd(20) + net_copy.load(model_path) + + net.eval() + net_copy.eval() + + for _ in range(1): + x = torch.randn(1, 3, 300, 300) + confidences1, locations1 = net.forward(x) + confidences2, locations2 = net_copy.forward(x) + assert (confidences1 == confidences2).long().sum() == confidences2.numel() + assert (locations1 == locations2).long().sum() == locations2.numel() diff --git a/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/transforms/__init__.py b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/transforms/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/transforms/transforms.py b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/transforms/transforms.py new file mode 100644 index 000000000..827deb25c --- /dev/null +++ b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/transforms/transforms.py @@ -0,0 +1,407 @@ +# from https://github.com/amdegroot/ssd.pytorch + + +import torch +from torchvision import transforms +import cv2 +import numpy as np +import types +from numpy import random + + +def intersect(box_a, box_b): + max_xy = np.minimum(box_a[:, 2:], box_b[2:]) + 
min_xy = np.maximum(box_a[:, :2], box_b[:2]) + inter = np.clip((max_xy - min_xy), a_min=0, a_max=np.inf) + return inter[:, 0] * inter[:, 1] + + +def jaccard_numpy(box_a, box_b): + """Compute the jaccard overlap of two sets of boxes. The jaccard overlap + is simply the intersection over union of two boxes. + E.g.: + A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B) + Args: + box_a: Multiple bounding boxes, Shape: [num_boxes,4] + box_b: Single bounding box, Shape: [4] + Return: + jaccard overlap: Shape: [box_a.shape[0], box_a.shape[1]] + """ + inter = intersect(box_a, box_b) + area_a = ((box_a[:, 2]-box_a[:, 0]) * + (box_a[:, 3]-box_a[:, 1])) # [A,B] + area_b = ((box_b[2]-box_b[0]) * + (box_b[3]-box_b[1])) # [A,B] + union = area_a + area_b - inter + return inter / union # [A,B] + + +class Compose(object): + """Composes several augmentations together. + Args: + transforms (List[Transform]): list of transforms to compose. + Example: + >>> augmentations.Compose([ + >>> transforms.CenterCrop(10), + >>> transforms.ToTensor(), + >>> ]) + """ + + def __init__(self, transforms): + self.transforms = transforms + + def __call__(self, img, boxes=None, labels=None): + for t in self.transforms: + img, boxes, labels = t(img, boxes, labels) + return img, boxes, labels + + +class Lambda(object): + """Applies a lambda as a transform.""" + + def __init__(self, lambd): + assert isinstance(lambd, types.LambdaType) + self.lambd = lambd + + def __call__(self, img, boxes=None, labels=None): + return self.lambd(img, boxes, labels) + + +class ConvertFromInts(object): + def __call__(self, image, boxes=None, labels=None): + return image.astype(np.float32), boxes, labels + + +class SubtractMeans(object): + def __init__(self, mean): + self.mean = np.array(mean, dtype=np.float32) + + def __call__(self, image, boxes=None, labels=None): + image = image.astype(np.float32) + image -= self.mean + return image.astype(np.float32), boxes, labels + + +class ToAbsoluteCoords(object): + def __call__(self, image, boxes=None, labels=None): + height, width, channels = image.shape + boxes[:, 0] *= width + boxes[:, 2] *= width + boxes[:, 1] *= height + boxes[:, 3] *= height + + return image, boxes, labels + + +class ToPercentCoords(object): + def __call__(self, image, boxes=None, labels=None): + height, width, channels = image.shape + boxes[:, 0] /= width + boxes[:, 2] /= width + boxes[:, 1] /= height + boxes[:, 3] /= height + + return image, boxes, labels + + +class Resize(object): + def __init__(self, size=300): + self.size = size + + def __call__(self, image, boxes=None, labels=None): + image = cv2.resize(image, (self.size, + self.size)) + return image, boxes, labels + + +class RandomSaturation(object): + def __init__(self, lower=0.5, upper=1.5): + self.lower = lower + self.upper = upper + assert self.upper >= self.lower, "contrast upper must be >= lower." + assert self.lower >= 0, "contrast lower must be non-negative." 
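+ # scales the saturation (S) channel of an HSV image; PhotometricDistort applies it
+ # between its RGB->HSV and HSV->RGB conversions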
+ + def __call__(self, image, boxes=None, labels=None): + if random.randint(2): + image[:, :, 1] *= random.uniform(self.lower, self.upper) + + return image, boxes, labels + + +class RandomHue(object): + def __init__(self, delta=18.0): + assert delta >= 0.0 and delta <= 360.0 + self.delta = delta + + def __call__(self, image, boxes=None, labels=None): + if random.randint(2): + image[:, :, 0] += random.uniform(-self.delta, self.delta) + image[:, :, 0][image[:, :, 0] > 360.0] -= 360.0 + image[:, :, 0][image[:, :, 0] < 0.0] += 360.0 + return image, boxes, labels + + +class RandomLightingNoise(object): + def __init__(self): + self.perms = ((0, 1, 2), (0, 2, 1), + (1, 0, 2), (1, 2, 0), + (2, 0, 1), (2, 1, 0)) + + def __call__(self, image, boxes=None, labels=None): + if random.randint(2): + swap = self.perms[random.randint(len(self.perms))] + shuffle = SwapChannels(swap) # shuffle channels + image = shuffle(image) + return image, boxes, labels + + +class ConvertColor(object): + def __init__(self, current, transform): + self.transform = transform + self.current = current + + def __call__(self, image, boxes=None, labels=None): + if self.current == 'BGR' and self.transform == 'HSV': + image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV) + elif self.current == 'RGB' and self.transform == 'HSV': + image = cv2.cvtColor(image, cv2.COLOR_RGB2HSV) + elif self.current == 'BGR' and self.transform == 'RGB': + image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) + elif self.current == 'HSV' and self.transform == 'BGR': + image = cv2.cvtColor(image, cv2.COLOR_HSV2BGR) + elif self.current == 'HSV' and self.transform == "RGB": + image = cv2.cvtColor(image, cv2.COLOR_HSV2RGB) + else: + raise NotImplementedError + return image, boxes, labels + + +class RandomContrast(object): + def __init__(self, lower=0.5, upper=1.5): + self.lower = lower + self.upper = upper + assert self.upper >= self.lower, "contrast upper must be >= lower." + assert self.lower >= 0, "contrast lower must be non-negative." + + # expects float image + def __call__(self, image, boxes=None, labels=None): + if random.randint(2): + alpha = random.uniform(self.lower, self.upper) + image *= alpha + return image, boxes, labels + + +class RandomBrightness(object): + def __init__(self, delta=32): + assert delta >= 0.0 + assert delta <= 255.0 + self.delta = delta + + def __call__(self, image, boxes=None, labels=None): + if random.randint(2): + delta = random.uniform(-self.delta, self.delta) + image += delta + return image, boxes, labels + + +class ToCV2Image(object): + def __call__(self, tensor, boxes=None, labels=None): + return tensor.cpu().numpy().astype(np.float32).transpose((1, 2, 0)), boxes, labels + + +class ToTensor(object): + def __call__(self, cvimage, boxes=None, labels=None): + return torch.from_numpy(cvimage.astype(np.float32)).permute(2, 0, 1), boxes, labels + + +class RandomSampleCrop(object): + """Crop + Arguments: + img (Image): the image being input during training + boxes (Tensor): the original bounding boxes in pt form + labels (Tensor): the class labels for each bbox + mode (float tuple): the min and max jaccard overlaps + Return: + (img, boxes, classes) + img (Image): the cropped image + boxes (Tensor): the adjusted bounding boxes in pt form + labels (Tensor): the class labels for each bbox + """ + def __init__(self): + self.sample_options = ( + # using entire original input image + None, + # sample a patch s.t. 
MIN jaccard w/ obj in .1,.3,.4,.7,.9 + (0.1, None), + (0.3, None), + (0.7, None), + (0.9, None), + # randomly sample a patch + (None, None), + ) + + def __call__(self, image, boxes=None, labels=None): + height, width, _ = image.shape + while True: + # randomly choose a mode + mode = random.choice(self.sample_options) + if mode is None: + return image, boxes, labels + + min_iou, max_iou = mode + if min_iou is None: + min_iou = float('-inf') + if max_iou is None: + max_iou = float('inf') + + # max trails (50) + for _ in range(50): + current_image = image + + w = random.uniform(0.3 * width, width) + h = random.uniform(0.3 * height, height) + + # aspect ratio constraint b/t .5 & 2 + if h / w < 0.5 or h / w > 2: + continue + + left = random.uniform(width - w) + top = random.uniform(height - h) + + # convert to integer rect x1,y1,x2,y2 + rect = np.array([int(left), int(top), int(left+w), int(top+h)]) + + # calculate IoU (jaccard overlap) b/t the cropped and gt boxes + overlap = jaccard_numpy(boxes, rect) + + # is min and max overlap constraint satisfied? if not try again + if overlap.min() < min_iou and max_iou < overlap.max(): + continue + + # cut the crop from the image + current_image = current_image[rect[1]:rect[3], rect[0]:rect[2], + :] + + # keep overlap with gt box IF center in sampled patch + centers = (boxes[:, :2] + boxes[:, 2:]) / 2.0 + + # mask in all gt boxes that above and to the left of centers + m1 = (rect[0] < centers[:, 0]) * (rect[1] < centers[:, 1]) + + # mask in all gt boxes that under and to the right of centers + m2 = (rect[2] > centers[:, 0]) * (rect[3] > centers[:, 1]) + + # mask in that both m1 and m2 are true + mask = m1 * m2 + + # have any valid boxes? try again if not + if not mask.any(): + continue + + # take only matching gt boxes + current_boxes = boxes[mask, :].copy() + + # take only matching gt labels + current_labels = labels[mask] + + # should we use the box left and top corner or the crop's + current_boxes[:, :2] = np.maximum(current_boxes[:, :2], + rect[:2]) + # adjust to crop (by substracting crop's left,top) + current_boxes[:, :2] -= rect[:2] + + current_boxes[:, 2:] = np.minimum(current_boxes[:, 2:], + rect[2:]) + # adjust to crop (by substracting crop's left,top) + current_boxes[:, 2:] -= rect[:2] + + return current_image, current_boxes, current_labels + + +class Expand(object): + def __init__(self, mean): + self.mean = mean + + def __call__(self, image, boxes, labels): + if random.randint(2): + return image, boxes, labels + + height, width, depth = image.shape + ratio = random.uniform(1, 4) + left = random.uniform(0, width*ratio - width) + top = random.uniform(0, height*ratio - height) + + expand_image = np.zeros( + (int(height*ratio), int(width*ratio), depth), + dtype=image.dtype) + expand_image[:, :, :] = self.mean + expand_image[int(top):int(top + height), + int(left):int(left + width)] = image + image = expand_image + + boxes = boxes.copy() + boxes[:, :2] += (int(left), int(top)) + boxes[:, 2:] += (int(left), int(top)) + + return image, boxes, labels + + +class RandomMirror(object): + def __call__(self, image, boxes, classes): + _, width, _ = image.shape + if random.randint(2): + image = image[:, ::-1] + boxes = boxes.copy() + boxes[:, 0::2] = width - boxes[:, 2::-2] + return image, boxes, classes + + +class SwapChannels(object): + """Transforms a tensorized image by swapping the channels in the order + specified in the swap tuple. 
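+ For example, SwapChannels((2, 1, 0)) reorders an RGB image's channels to BGR.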
+ Args: + swaps (int triple): final order of channels + eg: (2, 1, 0) + """ + + def __init__(self, swaps): + self.swaps = swaps + + def __call__(self, image): + """ + Args: + image (Tensor): image tensor to be transformed + Return: + a tensor with channels swapped according to swap + """ + # if torch.is_tensor(image): + # image = image.data.cpu().numpy() + # else: + # image = np.array(image) + image = image[:, :, self.swaps] + return image + + +class PhotometricDistort(object): + def __init__(self): + self.pd = [ + RandomContrast(), # RGB + ConvertColor(current="RGB", transform='HSV'), # HSV + RandomSaturation(), # HSV + RandomHue(), # HSV + ConvertColor(current='HSV', transform='RGB'), # RGB + RandomContrast() # RGB + ] + self.rand_brightness = RandomBrightness() + self.rand_light_noise = RandomLightingNoise() + + def __call__(self, image, boxes, labels): + im = image.copy() + im, boxes, labels = self.rand_brightness(im, boxes, labels) + if random.randint(2): + distort = Compose(self.pd[:-1]) + else: + distort = Compose(self.pd[1:]) + im, boxes, labels = distort(im, boxes, labels) + return self.rand_light_noise(im, boxes, labels) + diff --git a/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/utils/__init__.py b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/utils/__init__.py new file mode 100644 index 000000000..0789bdb39 --- /dev/null +++ b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/utils/__init__.py @@ -0,0 +1 @@ +from .misc import * diff --git a/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/utils/box_utils.py b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/utils/box_utils.py new file mode 100644 index 000000000..42f246955 --- /dev/null +++ b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/utils/box_utils.py @@ -0,0 +1,295 @@ +import collections +import torch +import itertools +from typing import List +import math + +SSDBoxSizes = collections.namedtuple('SSDBoxSizes', ['min', 'max']) + +SSDSpec = collections.namedtuple('SSDSpec', ['feature_map_size', 'shrinkage', 'box_sizes', 'aspect_ratios']) + + +def generate_ssd_priors(specs: List[SSDSpec], image_size, clamp=True) -> torch.Tensor: + """Generate SSD Prior Boxes. + + It returns the center, height and width of the priors. The values are relative to the image size + Args: + specs: SSDSpecs about the shapes of sizes of prior boxes. i.e. + specs = [ + SSDSpec(38, 8, SSDBoxSizes(30, 60), [2]), + SSDSpec(19, 16, SSDBoxSizes(60, 111), [2, 3]), + SSDSpec(10, 32, SSDBoxSizes(111, 162), [2, 3]), + SSDSpec(5, 64, SSDBoxSizes(162, 213), [2, 3]), + SSDSpec(3, 100, SSDBoxSizes(213, 264), [2]), + SSDSpec(1, 300, SSDBoxSizes(264, 315), [2]) + ] + image_size: image size. + clamp: if true, clamp the values to make fall between [0.0, 1.0] + Returns: + priors (num_priors, 4): The prior boxes represented as [[center_x, center_y, w, h]]. All the values + are relative to the image size. 
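+ Each location contributes 2 + 2 * len(aspect_ratios) priors (a small square, a
+ sqrt(min * max) square, and a stretched pair per ratio); with the mobilenetv1_ssd_config
+ specs this gives 6 * (19^2 + 10^2 + 5^2 + 3^2 + 2^2 + 1^2) = 3000 priors.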
+ """ + priors = [] + for spec in specs: + scale = image_size / spec.shrinkage + for j, i in itertools.product(range(spec.feature_map_size), repeat=2): + x_center = (i + 0.5) / scale + y_center = (j + 0.5) / scale + + # small sized square box + size = spec.box_sizes.min + h = w = size / image_size + priors.append([ + x_center, + y_center, + w, + h + ]) + + # big sized square box + size = math.sqrt(spec.box_sizes.max * spec.box_sizes.min) + h = w = size / image_size + priors.append([ + x_center, + y_center, + w, + h + ]) + + # change h/w ratio of the small sized box + size = spec.box_sizes.min + h = w = size / image_size + for ratio in spec.aspect_ratios: + ratio = math.sqrt(ratio) + priors.append([ + x_center, + y_center, + w * ratio, + h / ratio + ]) + priors.append([ + x_center, + y_center, + w / ratio, + h * ratio + ]) + + priors = torch.tensor(priors) + if clamp: + torch.clamp(priors, 0.0, 1.0, out=priors) + return priors + + +def convert_locations_to_boxes(locations, priors, center_variance, + size_variance): + """Convert regressional location results of SSD into boxes in the form of (center_x, center_y, h, w). + + The conversion: + $$predicted\_center * center_variance = \frac {real\_center - prior\_center} {prior\_hw}$$ + $$exp(predicted\_hw * size_variance) = \frac {real\_hw} {prior\_hw}$$ + We do it in the inverse direction here. + Args: + locations (batch_size, num_priors, 4): the regression output of SSD. It will contain the outputs as well. + priors (num_priors, 4) or (batch_size/1, num_priors, 4): prior boxes. + center_variance: a float used to change the scale of center. + size_variance: a float used to change of scale of size. + Returns: + boxes: priors: [[center_x, center_y, h, w]]. All the values + are relative to the image size. + """ + # priors can have one dimension less. + if priors.dim() + 1 == locations.dim(): + priors = priors.unsqueeze(0) + return torch.cat([ + locations[..., :2] * center_variance * priors[..., 2:] + priors[..., :2], + torch.exp(locations[..., 2:] * size_variance) * priors[..., 2:] + ], dim=locations.dim() - 1) + + +def convert_boxes_to_locations(center_form_boxes, center_form_priors, center_variance, size_variance): + # priors can have one dimension less + if center_form_priors.dim() + 1 == center_form_boxes.dim(): + center_form_priors = center_form_priors.unsqueeze(0) + return torch.cat([ + (center_form_boxes[..., :2] - center_form_priors[..., :2]) / center_form_priors[..., 2:] / center_variance, + torch.log(center_form_boxes[..., 2:] / center_form_priors[..., 2:]) / size_variance + ], dim=center_form_boxes.dim() - 1) + + +def area_of(left_top, right_bottom) -> torch.Tensor: + """Compute the areas of rectangles given two corners. + + Args: + left_top (N, 2): left top corner. + right_bottom (N, 2): right bottom corner. + + Returns: + area (N): return the area. + """ + hw = torch.clamp(right_bottom - left_top, min=0.0) + return hw[..., 0] * hw[..., 1] + + +def iou_of(boxes0, boxes1, eps=1e-5): + """Return intersection-over-union (Jaccard index) of boxes. + + Args: + boxes0 (N, 4): ground truth boxes. + boxes1 (N or 1, 4): predicted boxes. + eps: a small number to avoid 0 as denominator. + Returns: + iou (N): IoU values. 
+ """ + overlap_left_top = torch.max(boxes0[..., :2], boxes1[..., :2]) + overlap_right_bottom = torch.min(boxes0[..., 2:], boxes1[..., 2:]) + + overlap_area = area_of(overlap_left_top, overlap_right_bottom) + area0 = area_of(boxes0[..., :2], boxes0[..., 2:]) + area1 = area_of(boxes1[..., :2], boxes1[..., 2:]) + return overlap_area / (area0 + area1 - overlap_area + eps) + + +def assign_priors(gt_boxes, gt_labels, corner_form_priors, + iou_threshold): + """Assign ground truth boxes and targets to priors. + + Args: + gt_boxes (num_targets, 4): ground truth boxes. + gt_labels (num_targets): labels of targets. + priors (num_priors, 4): corner form priors + Returns: + boxes (num_priors, 4): real values for priors. + labels (num_priros): labels for priors. + """ + # size: num_priors x num_targets + ious = iou_of(gt_boxes.unsqueeze(0), corner_form_priors.unsqueeze(1)) + # size: num_priors + best_target_per_prior, best_target_per_prior_index = ious.max(1) + # size: num_targets + best_prior_per_target, best_prior_per_target_index = ious.max(0) + + for target_index, prior_index in enumerate(best_prior_per_target_index): + best_target_per_prior_index[prior_index] = target_index + # 2.0 is used to make sure every target has a prior assigned + best_target_per_prior.index_fill_(0, best_prior_per_target_index, 2) + # size: num_priors + labels = gt_labels[best_target_per_prior_index] + labels[best_target_per_prior < iou_threshold] = 0 # the backgournd id + boxes = gt_boxes[best_target_per_prior_index] + return boxes, labels + + +def hard_negative_mining(loss, labels, neg_pos_ratio): + """ + It used to suppress the presence of a large number of negative prediction. + It works on image level not batch level. + For any example/image, it keeps all the positive predictions and + cut the number of negative predictions to make sure the ratio + between the negative examples and positive examples is no more + the given ratio for an image. + + Args: + loss (N, num_priors): the loss for each example. + labels (N, num_priors): the labels. + neg_pos_ratio: the ratio between the negative examples and positive examples. + """ + pos_mask = labels > 0 + num_pos = pos_mask.long().sum(dim=1, keepdim=True) + num_neg = num_pos * neg_pos_ratio + + loss[pos_mask] = -math.inf + _, indexes = loss.sort(dim=1, descending=True) + _, orders = indexes.sort(dim=1) + neg_mask = orders < num_neg + return pos_mask | neg_mask + + +def center_form_to_corner_form(locations): + return torch.cat([locations[..., :2] - locations[..., 2:]/2, + locations[..., :2] + locations[..., 2:]/2], locations.dim() - 1) + + +def corner_form_to_center_form(boxes): + return torch.cat([ + (boxes[..., :2] + boxes[..., 2:]) / 2, + boxes[..., 2:] - boxes[..., :2] + ], boxes.dim() - 1) + + +def hard_nms(box_scores, iou_threshold, top_k=-1, candidate_size=200): + """ + + Args: + box_scores (N, 5): boxes in corner-form and probabilities. + iou_threshold: intersection over union threshold. + top_k: keep top_k results. If k <= 0, keep all the results. + candidate_size: only consider the candidates with the highest scores. 
+ Returns: + picked: a list of indexes of the kept boxes + """ + scores = box_scores[:, -1] + boxes = box_scores[:, :-1] + picked = [] + _, indexes = scores.sort(descending=True) + indexes = indexes[:candidate_size] + while len(indexes) > 0: + current = indexes[0] + picked.append(current.item()) + if 0 < top_k == len(picked) or len(indexes) == 1: + break + current_box = boxes[current, :] + indexes = indexes[1:] + rest_boxes = boxes[indexes, :] + iou = iou_of( + rest_boxes, + current_box.unsqueeze(0), + ) + indexes = indexes[iou <= iou_threshold] + + return box_scores[picked, :] + + +def nms(box_scores, nms_method=None, score_threshold=None, iou_threshold=None, + sigma=0.5, top_k=-1, candidate_size=200): + if nms_method == "soft": + return soft_nms(box_scores, score_threshold, sigma, top_k) + else: + return hard_nms(box_scores, iou_threshold, top_k, candidate_size=candidate_size) + + +def soft_nms(box_scores, score_threshold, sigma=0.5, top_k=-1): + """Soft NMS implementation. + + References: + https://arxiv.org/abs/1704.04503 + https://github.com/facebookresearch/Detectron/blob/master/detectron/utils/cython_nms.pyx + + Args: + box_scores (N, 5): boxes in corner-form and probabilities. + score_threshold: boxes with scores less than value are not considered. + sigma: the parameter in score re-computation. + scores[i] = scores[i] * exp(-(iou_i)^2 / simga) + top_k: keep top_k results. If k <= 0, keep all the results. + Returns: + picked_box_scores (K, 5): results of NMS. + """ + picked_box_scores = [] + while box_scores.size(0) > 0: + max_score_index = torch.argmax(box_scores[:, 4]) + cur_box_prob = torch.tensor(box_scores[max_score_index, :]) + picked_box_scores.append(cur_box_prob) + if len(picked_box_scores) == top_k > 0 or box_scores.size(0) == 1: + break + cur_box = cur_box_prob[:-1] + box_scores[max_score_index, :] = box_scores[-1, :] + box_scores = box_scores[:-1, :] + ious = iou_of(cur_box.unsqueeze(0), box_scores[:, :-1]) + box_scores[:, -1] = box_scores[:, -1] * torch.exp(-(ious * ious) / sigma) + box_scores = box_scores[box_scores[:, -1] > score_threshold, :] + if len(picked_box_scores) > 0: + return torch.stack(picked_box_scores) + else: + return torch.tensor([]) + + + diff --git a/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/utils/box_utils_numpy.py b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/utils/box_utils_numpy.py new file mode 100644 index 000000000..177456f2f --- /dev/null +++ b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/utils/box_utils_numpy.py @@ -0,0 +1,238 @@ +from .box_utils import SSDSpec + +from typing import List +import itertools +import math +import numpy as np + + +def generate_ssd_priors(specs: List[SSDSpec], image_size, clamp=True): + """Generate SSD Prior Boxes. + + It returns the center, height and width of the priors. The values are relative to the image size + Args: + specs: SSDSpecs about the shapes of sizes of prior boxes. i.e. + specs = [ + SSDSpec(38, 8, SSDBoxSizes(30, 60), [2]), + SSDSpec(19, 16, SSDBoxSizes(60, 111), [2, 3]), + SSDSpec(10, 32, SSDBoxSizes(111, 162), [2, 3]), + SSDSpec(5, 64, SSDBoxSizes(162, 213), [2, 3]), + SSDSpec(3, 100, SSDBoxSizes(213, 264), [2]), + SSDSpec(1, 300, SSDBoxSizes(264, 315), [2]) + ] + image_size: image size. + clamp: if true, clamp the values to make fall between [0.0, 1.0] + Returns: + priors (num_priors, 4): The prior boxes represented as [[center_x, center_y, w, h]]. All the values + are relative to the image size. 
+ """ + priors = [] + for spec in specs: + scale = image_size / spec.shrinkage + for j, i in itertools.product(range(spec.feature_map_size), repeat=2): + x_center = (i + 0.5) / scale + y_center = (j + 0.5) / scale + + # small sized square box + size = spec.box_sizes.min + h = w = size / image_size + priors.append([ + x_center, + y_center, + w, + h + ]) + + # big sized square box + size = math.sqrt(spec.box_sizes.max * spec.box_sizes.min) + h = w = size / image_size + priors.append([ + x_center, + y_center, + w, + h + ]) + + # change h/w ratio of the small sized box + size = spec.box_sizes.min + h = w = size / image_size + for ratio in spec.aspect_ratios: + ratio = math.sqrt(ratio) + priors.append([ + x_center, + y_center, + w * ratio, + h / ratio + ]) + priors.append([ + x_center, + y_center, + w / ratio, + h * ratio + ]) + + priors = np.array(priors, dtype=np.float32) + if clamp: + np.clip(priors, 0.0, 1.0, out=priors) + return priors + + +def convert_locations_to_boxes(locations, priors, center_variance, + size_variance): + """Convert regressional location results of SSD into boxes in the form of (center_x, center_y, h, w). + + The conversion: + $$predicted\_center * center_variance = \frac {real\_center - prior\_center} {prior\_hw}$$ + $$exp(predicted\_hw * size_variance) = \frac {real\_hw} {prior\_hw}$$ + We do it in the inverse direction here. + Args: + locations (batch_size, num_priors, 4): the regression output of SSD. It will contain the outputs as well. + priors (num_priors, 4) or (batch_size/1, num_priors, 4): prior boxes. + center_variance: a float used to change the scale of center. + size_variance: a float used to change of scale of size. + Returns: + boxes: priors: [[center_x, center_y, h, w]]. All the values + are relative to the image size. + """ + # priors can have one dimension less. + if len(priors.shape) + 1 == len(locations.shape): + priors = np.expand_dims(priors, 0) + return np.concatenate([ + locations[..., :2] * center_variance * priors[..., 2:] + priors[..., :2], + np.exp(locations[..., 2:] * size_variance) * priors[..., 2:] + ], axis=len(locations.shape) - 1) + + +def convert_boxes_to_locations(center_form_boxes, center_form_priors, center_variance, size_variance): + # priors can have one dimension less + if len(center_form_priors.shape) + 1 == len(center_form_boxes.shape): + center_form_priors = np.expand_dims(center_form_priors, 0) + return np.concatenate([ + (center_form_boxes[..., :2] - center_form_priors[..., :2]) / center_form_priors[..., 2:] / center_variance, + np.log(center_form_boxes[..., 2:] / center_form_priors[..., 2:]) / size_variance + ], axis=len(center_form_boxes.shape) - 1) + + +def area_of(left_top, right_bottom): + """Compute the areas of rectangles given two corners. + + Args: + left_top (N, 2): left top corner. + right_bottom (N, 2): right bottom corner. + + Returns: + area (N): return the area. + """ + hw = np.clip(right_bottom - left_top, 0.0, None) + return hw[..., 0] * hw[..., 1] + + +def iou_of(boxes0, boxes1, eps=1e-5): + """Return intersection-over-union (Jaccard index) of boxes. + + Args: + boxes0 (N, 4): ground truth boxes. + boxes1 (N or 1, 4): predicted boxes. + eps: a small number to avoid 0 as denominator. + Returns: + iou (N): IoU values. 
+ """ + overlap_left_top = np.maximum(boxes0[..., :2], boxes1[..., :2]) + overlap_right_bottom = np.minimum(boxes0[..., 2:], boxes1[..., 2:]) + + overlap_area = area_of(overlap_left_top, overlap_right_bottom) + area0 = area_of(boxes0[..., :2], boxes0[..., 2:]) + area1 = area_of(boxes1[..., :2], boxes1[..., 2:]) + return overlap_area / (area0 + area1 - overlap_area + eps) + + +def center_form_to_corner_form(locations): + return np.concatenate([locations[..., :2] - locations[..., 2:]/2, + locations[..., :2] + locations[..., 2:]/2], len(locations.shape) - 1) + + +def corner_form_to_center_form(boxes): + return np.concatenate([ + (boxes[..., :2] + boxes[..., 2:]) / 2, + boxes[..., 2:] - boxes[..., :2] + ], len(boxes.shape) - 1) + + +def hard_nms(box_scores, iou_threshold, top_k=-1, candidate_size=200): + """ + + Args: + box_scores (N, 5): boxes in corner-form and probabilities. + iou_threshold: intersection over union threshold. + top_k: keep top_k results. If k <= 0, keep all the results. + candidate_size: only consider the candidates with the highest scores. + Returns: + picked: a list of indexes of the kept boxes + """ + scores = box_scores[:, -1] + boxes = box_scores[:, :-1] + picked = [] + #_, indexes = scores.sort(descending=True) + indexes = np.argsort(scores) + #indexes = indexes[:candidate_size] + indexes = indexes[-candidate_size:] + while len(indexes) > 0: + #current = indexes[0] + current = indexes[-1] + picked.append(current) + if 0 < top_k == len(picked) or len(indexes) == 1: + break + current_box = boxes[current, :] + #indexes = indexes[1:] + indexes = indexes[:-1] + rest_boxes = boxes[indexes, :] + iou = iou_of( + rest_boxes, + np.expand_dims(current_box, axis=0), + ) + indexes = indexes[iou <= iou_threshold] + + return box_scores[picked, :] + + +# def nms(box_scores, nms_method=None, score_threshold=None, iou_threshold=None, +# sigma=0.5, top_k=-1, candidate_size=200): +# if nms_method == "soft": +# return soft_nms(box_scores, score_threshold, sigma, top_k) +# else: +# return hard_nms(box_scores, iou_threshold, top_k, candidate_size=candidate_size) + +# +# def soft_nms(box_scores, score_threshold, sigma=0.5, top_k=-1): +# """Soft NMS implementation. +# +# References: +# https://arxiv.org/abs/1704.04503 +# https://github.com/facebookresearch/Detectron/blob/master/detectron/utils/cython_nms.pyx +# +# Args: +# box_scores (N, 5): boxes in corner-form and probabilities. +# score_threshold: boxes with scores less than value are not considered. +# sigma: the parameter in score re-computation. +# scores[i] = scores[i] * exp(-(iou_i)^2 / simga) +# top_k: keep top_k results. If k <= 0, keep all the results. +# Returns: +# picked_box_scores (K, 5): results of NMS. 
+# """ +# picked_box_scores = [] +# while box_scores.size(0) > 0: +# max_score_index = torch.argmax(box_scores[:, 4]) +# cur_box_prob = torch.tensor(box_scores[max_score_index, :]) +# picked_box_scores.append(cur_box_prob) +# if len(picked_box_scores) == top_k > 0 or box_scores.size(0) == 1: +# break +# cur_box = cur_box_prob[:-1] +# box_scores[max_score_index, :] = box_scores[-1, :] +# box_scores = box_scores[:-1, :] +# ious = iou_of(cur_box.unsqueeze(0), box_scores[:, :-1]) +# box_scores[:, -1] = box_scores[:, -1] * torch.exp(-(ious * ious) / sigma) +# box_scores = box_scores[box_scores[:, -1] > score_threshold, :] +# if len(picked_box_scores) > 0: +# return torch.stack(picked_box_scores) +# else: +# return torch.tensor([]) diff --git a/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/utils/measurements.py b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/utils/measurements.py new file mode 100644 index 000000000..5cc590c1d --- /dev/null +++ b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/utils/measurements.py @@ -0,0 +1,32 @@ +import numpy as np + + +def compute_average_precision(precision, recall): + """ + It computes average precision based on the definition of Pascal Competition. It computes the under curve area + of precision and recall. Recall follows the normal definition. Precision is a variant. + pascal_precision[i] = typical_precision[i:].max() + """ + # identical but faster version of new_precision[i] = old_precision[i:].max() + precision = np.concatenate([[0.0], precision, [0.0]]) + for i in range(len(precision) - 1, 0, -1): + precision[i - 1] = np.maximum(precision[i - 1], precision[i]) + + # find the index where the value changes + recall = np.concatenate([[0.0], recall, [1.0]]) + changing_points = np.where(recall[1:] != recall[:-1])[0] + + # compute under curve area + areas = (recall[changing_points + 1] - recall[changing_points]) * precision[changing_points + 1] + return areas.sum() + + +def compute_voc2007_average_precision(precision, recall): + ap = 0. + for t in np.arange(0., 1.1, 0.1): + if np.sum(recall >= t) == 0: + p = 0 + else: + p = np.max(precision[recall >= t]) + ap = ap + p / 11. 
+ return ap diff --git a/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/utils/misc.py b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/utils/misc.py new file mode 100644 index 000000000..e79545853 --- /dev/null +++ b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/utils/misc.py @@ -0,0 +1,45 @@ +import time +import torch + + +def str2bool(s): + return s.lower() in ('true', '1') + + +class Timer: + def __init__(self): + self.clock = {} + + def start(self, key="default"): + self.clock[key] = time.time() + + def end(self, key="default"): + if key not in self.clock: + raise Exception(f"{key} is not in the clock.") + interval = time.time() - self.clock[key] + del self.clock[key] + return interval + + +def save_checkpoint(epoch, net_state_dict, optimizer_state_dict, best_score, checkpoint_path, model_path): + torch.save({ + 'epoch': epoch, + 'model': net_state_dict, + 'optimizer': optimizer_state_dict, + 'best_score': best_score + }, checkpoint_path) + torch.save(net_state_dict, model_path) + + +def load_checkpoint(checkpoint_path): + return torch.load(checkpoint_path) + + +def freeze_net_layers(net): + for param in net.parameters(): + param.requires_grad = False + + +def store_labels(path, labels): + with open(path, "w") as f: + f.write("\n".join(labels)) diff --git a/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/utils/model_book.py b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/utils/model_book.py new file mode 100644 index 000000000..b1e9d17e9 --- /dev/null +++ b/models/object_detection/pytorch/ssd-mobilenet/inference/gpu/vision/utils/model_book.py @@ -0,0 +1,81 @@ +from collections import OrderedDict +import torch.nn as nn + + +class ModelBook: + """Maintain the mapping between modules and their paths. + + Example: + book = ModelBook(model_ft) + for p, m in book.conv2d_modules(): + print('path:', p, 'num of filters:', m.out_channels) + assert m is book.get_module(p) + """ + + def __init__(self, model): + self._model = model + self._modules = OrderedDict() + self._paths = OrderedDict() + path = [] + self._construct(self._model, path) + + def _construct(self, module, path): + if not module._modules: + return + for name, m in module._modules.items(): + cur_path = tuple(path + [name]) + self._paths[m] = cur_path + self._modules[cur_path] = m + self._construct(m, path + [name]) + + def conv2d_modules(self): + return self.modules(nn.Conv2d) + + def linear_modules(self): + return self.modules(nn.Linear) + + def modules(self, module_type=None): + for p, m in self._modules.items(): + if not module_type or isinstance(m, module_type): + yield p, m + + def num_of_conv2d_modules(self): + return self.num_of_modules(nn.Conv2d) + + def num_of_conv2d_filters(self): + """Return the sum of out_channels of all conv2d layers. + + Here we treat the sub weight with size of [in_channels, h, w] as a single filter. 
+ """ + num_filters = 0 + for _, m in self.conv2d_modules(): + num_filters += m.out_channels + return num_filters + + def num_of_linear_modules(self): + return self.num_of_modules(nn.Linear) + + def num_of_linear_filters(self): + num_filters = 0 + for _, m in self.linear_modules(): + num_filters += m.out_features + return num_filters + + def num_of_modules(self, module_type=None): + num = 0 + for p, m in self._modules.items(): + if not module_type or isinstance(m, module_type): + num += 1 + return num + + def get_module(self, path): + return self._modules.get(path) + + def get_path(self, module): + return self._paths.get(module) + + def update(self, path, module): + old_module = self._modules[path] + del self._paths[old_module] + self._paths[module] = path + self._modules[path] = module diff --git a/models/object_detection/pytorch/yolov4/inference/gpu/License.txt b/models/object_detection/pytorch/yolov4/inference/gpu/License.txt new file mode 100644 index 000000000..d64569567 --- /dev/null +++ b/models/object_detection/pytorch/yolov4/inference/gpu/License.txt @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. 
+ + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
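For reference, a minimal sketch of how the prior-box and NMS helpers added above in vision/utils/box_utils.py are typically wired together at inference time. The import path, the 0.1/0.2 center/size variances, and the random tensors standing in for network output are illustrative assumptions rather than values taken from this patch; the SSDSpec list mirrors the example in the generate_ssd_priors docstring for a 300x300 input.

import torch
from vision.utils import box_utils  # assumed import path for the file added above
from vision.utils.box_utils import SSDSpec, SSDBoxSizes

# Prior-box layout for a 300x300 SSD, as given in the generate_ssd_priors docstring.
specs = [
    SSDSpec(38, 8, SSDBoxSizes(30, 60), [2]),
    SSDSpec(19, 16, SSDBoxSizes(60, 111), [2, 3]),
    SSDSpec(10, 32, SSDBoxSizes(111, 162), [2, 3]),
    SSDSpec(5, 64, SSDBoxSizes(162, 213), [2, 3]),
    SSDSpec(3, 100, SSDBoxSizes(213, 264), [2]),
    SSDSpec(1, 300, SSDBoxSizes(264, 315), [2]),
]
priors = box_utils.generate_ssd_priors(specs, image_size=300)  # (num_priors, 4), center form

# Random tensors stand in for one image's regression output and one class's scores.
locations = torch.randn(1, priors.size(0), 4)
scores = torch.rand(priors.size(0), 1)

# Decode regressions into center-form boxes, then convert to corner form for NMS.
# The 0.1/0.2 variances are the usual SSD defaults, assumed here.
boxes = box_utils.convert_locations_to_boxes(
    locations, priors, center_variance=0.1, size_variance=0.2)[0]
boxes = box_utils.center_form_to_corner_form(boxes)

# hard_nms expects (N, 5): corner-form box plus the score in the last column.
box_scores = torch.cat([boxes, scores], dim=1)
kept = box_utils.hard_nms(box_scores, iou_threshold=0.45, top_k=200)
print(tuple(kept.shape))

Note that hard_nms operates on a single class at a time (corner-form boxes with the score in the last column), so a multi-class post-processor would loop this call once per foreground class.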
diff --git a/models/object_detection/pytorch/yolov4/inference/gpu/models.py b/models/object_detection/pytorch/yolov4/inference/gpu/models.py new file mode 100644 index 000000000..633a61491 --- /dev/null +++ b/models/object_detection/pytorch/yolov4/inference/gpu/models.py @@ -0,0 +1,640 @@ +import argparse +import torch +import intel_extension_for_pytorch +from torch import nn +import torch.nn.functional as F +from tool.torch_utils import * +from tool.yolo_layer import YoloLayer + +from torch.jit._recursive import wrap_cpp_module +from torch.quantization.quantize_jit import ( + convert_jit, + prepare_jit, +) +from torch.quantization import default_qconfig +import math + +torch._C._jit_set_profiling_mode(False) +torch._C._jit_set_profiling_executor(False) + +parser = argparse.ArgumentParser(description='PyTorch ImageNet Training') +parser.add_argument('-n', '--n_classes', required=True, type=int, help='n_classes') +parser.add_argument('--weight', help='destination of weightfiles') +parser.add_argument('-i', '--image', help='The address where the picture is stored') +parser.add_argument('-e', '--height', required=True, type=int, help='Image height') +parser.add_argument('-w', '--width', required=True, type=int, help='Image width') +parser.add_argument('-name', help='namefile lable_path') +parser.add_argument('-d', '--datatype', default='fp32', choices=['int8', 'bf16', 'fp16', 'fp32', 'tp32'], + help='datatype select') +parser.add_argument("--dummy", default=0, type=int, help='use dummy data for ' + 'benchmark training or val') +parser.add_argument('--benchmark', default=0, type=int, help='for int8 benchmark ' + 'performance, move H2D out of E2E time') +parser.add_argument('-b', '--batch-size', default=1, type=int, + metavar='N', + help='mini-batch size (default: 1), this is the total ' + 'batch size of all GPUs on the current node when ' + 'using Data Parallel or Distributed Data Parallel') +parser.add_argument('--iter', default=2, type=int, help='iteration number, default:2') +parser.add_argument("--save", help='Path to save entile model') +parser.add_argument("--load", help='Path to load entile model') +parser.add_argument('--jit', default=-1, type=int,choices=[-1, 0, 1], + help='Select run with jit or impe path, 0 : impe, 1 : jit') +class Mish(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + x = x * (torch.tanh(torch.nn.functional.softplus(x))) + return x + + +class Upsample(nn.Module): + def __init__(self): + super(Upsample, self).__init__() + + def forward(self, x, target_size, inference=False): + assert (x.data.dim() == 4) + # _, _, tH, tW = target_size + + ####This change is for better performance for inference on Pytorch. 
+ #if inference: + # #B = x.data.size(0) + # #C = x.data.size(1) + # #H = x.data.size(2) + # #W = x.data.size(3) + # print('upsample input == ', x.cpu()) + # print('target_size == ', target_size) + # print('x size === ', x.shape) + # y = x.view(x.size(0), x.size(1), x.size(2), 1, x.size(3), 1).\ + # expand(x.size(0), x.size(1), x.size(2), target_size[2] // x.size(2), x.size(3), target_size[3] // x.size(3)).\ + # contiguous().view(x.size(0), x.size(1), target_size[2], target_size[3]) + # print('upsample output == ', y.cpu()) + # return y + #else: + return F.interpolate(x, size=(target_size[2], target_size[3]), scale_factor = None, mode='nearest') + + +class Conv_Bn_Activation(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size, stride, activation, bn=True, bias=False): + super().__init__() + pad = (kernel_size - 1) // 2 + + self.conv = nn.ModuleList() + if bias: + self.conv.append(nn.Conv2d(in_channels, out_channels, kernel_size, stride, pad)) + else: + self.conv.append(nn.Conv2d(in_channels, out_channels, kernel_size, stride, pad, bias=False)) + if bn: + self.conv.append(nn.BatchNorm2d(out_channels)) + if activation == "mish": + self.conv.append(Mish()) + elif activation == "relu": + self.conv.append(nn.ReLU(inplace=True)) + elif activation == "leaky": + self.conv.append(nn.LeakyReLU(0.1, inplace=True)) + elif activation == "linear": + pass + else: + print("activate error !!! {} {} {}".format(sys._getframe().f_code.co_filename, + sys._getframe().f_code.co_name, sys._getframe().f_lineno)) + + def forward(self, x): + for l in self.conv: + x = l(x) + return x + + +class ResBlock(nn.Module): + """ + Sequential residual blocks each of which consists of \ + two convolution layers. + Args: + ch (int): number of input and output channels. + nblocks (int): number of residual blocks. + shortcut (bool): if True, residual tensor addition is enabled. 
+ """ + + def __init__(self, ch, nblocks=1, shortcut=True): + super().__init__() + self.shortcut = shortcut + self.module_list = nn.ModuleList() + for i in range(nblocks): + resblock_one = nn.ModuleList() + resblock_one.append(Conv_Bn_Activation(ch, ch, 1, 1, 'mish')) + resblock_one.append(Conv_Bn_Activation(ch, ch, 3, 1, 'mish')) + self.module_list.append(resblock_one) + + def forward(self, x): + for module in self.module_list: + h = x + for res in module: + h = res(h) + x = x + h if self.shortcut else h + return x + + +class DownSample1(nn.Module): + def __init__(self): + super().__init__() + self.conv1 = Conv_Bn_Activation(3, 32, 3, 1, 'mish') + + self.conv2 = Conv_Bn_Activation(32, 64, 3, 2, 'mish') + self.conv3 = Conv_Bn_Activation(64, 64, 1, 1, 'mish') + # [route] + # layers = -2 + self.conv4 = Conv_Bn_Activation(64, 64, 1, 1, 'mish') + + self.conv5 = Conv_Bn_Activation(64, 32, 1, 1, 'mish') + self.conv6 = Conv_Bn_Activation(32, 64, 3, 1, 'mish') + # [shortcut] + # from=-3 + # activation = linear + + self.conv7 = Conv_Bn_Activation(64, 64, 1, 1, 'mish') + # [route] + # layers = -1, -7 + self.conv8 = Conv_Bn_Activation(128, 64, 1, 1, 'mish') + + def forward(self, input): + x1 = self.conv1(input) + x2 = self.conv2(x1) + x3 = self.conv3(x2) + # route -2 + x4 = self.conv4(x2) + x5 = self.conv5(x4) + x6 = self.conv6(x5) + # shortcut -3 + x6 = x6 + x4 + + x7 = self.conv7(x6) + # [route] + # layers = -1, -7 + x7 = torch.cat([x7, x3], dim=1) + x8 = self.conv8(x7) + return x8 + + +class DownSample2(nn.Module): + def __init__(self): + super().__init__() + self.conv1 = Conv_Bn_Activation(64, 128, 3, 2, 'mish') + self.conv2 = Conv_Bn_Activation(128, 64, 1, 1, 'mish') + # r -2 + self.conv3 = Conv_Bn_Activation(128, 64, 1, 1, 'mish') + + self.resblock = ResBlock(ch=64, nblocks=2) + + # s -3 + self.conv4 = Conv_Bn_Activation(64, 64, 1, 1, 'mish') + # r -1 -10 + self.conv5 = Conv_Bn_Activation(128, 128, 1, 1, 'mish') + + def forward(self, input): + x1 = self.conv1(input) + x2 = self.conv2(x1) + x3 = self.conv3(x1) + + r = self.resblock(x3) + x4 = self.conv4(r) + + x4 = torch.cat([x4, x2], dim=1) + x5 = self.conv5(x4) + return x5 + + +class DownSample3(nn.Module): + def __init__(self): + super().__init__() + self.conv1 = Conv_Bn_Activation(128, 256, 3, 2, 'mish') + self.conv2 = Conv_Bn_Activation(256, 128, 1, 1, 'mish') + self.conv3 = Conv_Bn_Activation(256, 128, 1, 1, 'mish') + + self.resblock = ResBlock(ch=128, nblocks=8) + self.conv4 = Conv_Bn_Activation(128, 128, 1, 1, 'mish') + self.conv5 = Conv_Bn_Activation(256, 256, 1, 1, 'mish') + + def forward(self, input): + x1 = self.conv1(input) + x2 = self.conv2(x1) + x3 = self.conv3(x1) + + r = self.resblock(x3) + x4 = self.conv4(r) + + x4 = torch.cat([x4, x2], dim=1) + x5 = self.conv5(x4) + return x5 + + +class DownSample4(nn.Module): + def __init__(self): + super().__init__() + self.conv1 = Conv_Bn_Activation(256, 512, 3, 2, 'mish') + self.conv2 = Conv_Bn_Activation(512, 256, 1, 1, 'mish') + self.conv3 = Conv_Bn_Activation(512, 256, 1, 1, 'mish') + + self.resblock = ResBlock(ch=256, nblocks=8) + self.conv4 = Conv_Bn_Activation(256, 256, 1, 1, 'mish') + self.conv5 = Conv_Bn_Activation(512, 512, 1, 1, 'mish') + + def forward(self, input): + x1 = self.conv1(input) + x2 = self.conv2(x1) + x3 = self.conv3(x1) + + r = self.resblock(x3) + x4 = self.conv4(r) + + x4 = torch.cat([x4, x2], dim=1) + x5 = self.conv5(x4) + return x5 + + +class DownSample5(nn.Module): + def __init__(self): + super().__init__() + self.conv1 = Conv_Bn_Activation(512, 1024, 3, 2, 
'mish') + self.conv2 = Conv_Bn_Activation(1024, 512, 1, 1, 'mish') + self.conv3 = Conv_Bn_Activation(1024, 512, 1, 1, 'mish') + + self.resblock = ResBlock(ch=512, nblocks=4) + self.conv4 = Conv_Bn_Activation(512, 512, 1, 1, 'mish') + self.conv5 = Conv_Bn_Activation(1024, 1024, 1, 1, 'mish') + + def forward(self, input): + x1 = self.conv1(input) + x2 = self.conv2(x1) + x3 = self.conv3(x1) + + r = self.resblock(x3) + x4 = self.conv4(r) + + x4 = torch.cat([x4, x2], dim=1) + x5 = self.conv5(x4) + return x5 + + +class Neck(nn.Module): + def __init__(self, inference=False): + super().__init__() + self.inference = inference + + self.conv1 = Conv_Bn_Activation(1024, 512, 1, 1, 'leaky') + self.conv2 = Conv_Bn_Activation(512, 1024, 3, 1, 'leaky') + self.conv3 = Conv_Bn_Activation(1024, 512, 1, 1, 'leaky') + # SPP + self.maxpool1 = nn.MaxPool2d(kernel_size=5, stride=1, padding=5 // 2) + self.maxpool2 = nn.MaxPool2d(kernel_size=9, stride=1, padding=9 // 2) + self.maxpool3 = nn.MaxPool2d(kernel_size=13, stride=1, padding=13 // 2) + + # R -1 -3 -5 -6 + # SPP + self.conv4 = Conv_Bn_Activation(2048, 512, 1, 1, 'leaky') + self.conv5 = Conv_Bn_Activation(512, 1024, 3, 1, 'leaky') + self.conv6 = Conv_Bn_Activation(1024, 512, 1, 1, 'leaky') + self.conv7 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky') + # UP + self.upsample1 = Upsample() + # R 85 + self.conv8 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky') + # R -1 -3 + self.conv9 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky') + self.conv10 = Conv_Bn_Activation(256, 512, 3, 1, 'leaky') + self.conv11 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky') + self.conv12 = Conv_Bn_Activation(256, 512, 3, 1, 'leaky') + self.conv13 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky') + self.conv14 = Conv_Bn_Activation(256, 128, 1, 1, 'leaky') + # UP + self.upsample2 = Upsample() + # R 54 + self.conv15 = Conv_Bn_Activation(256, 128, 1, 1, 'leaky') + # R -1 -3 + self.conv16 = Conv_Bn_Activation(256, 128, 1, 1, 'leaky') + self.conv17 = Conv_Bn_Activation(128, 256, 3, 1, 'leaky') + self.conv18 = Conv_Bn_Activation(256, 128, 1, 1, 'leaky') + self.conv19 = Conv_Bn_Activation(128, 256, 3, 1, 'leaky') + self.conv20 = Conv_Bn_Activation(256, 128, 1, 1, 'leaky') + + def forward(self, input, downsample4, downsample3, inference=False): + x1 = self.conv1(input) + x2 = self.conv2(x1) + x3 = self.conv3(x2) + # SPP + m1 = self.maxpool1(x3) + m2 = self.maxpool2(x3) + m3 = self.maxpool3(x3) + spp = torch.cat([m3, m2, m1, x3], dim=1) + # SPP end + x4 = self.conv4(spp) + x5 = self.conv5(x4) + x6 = self.conv6(x5) + x7 = self.conv7(x6) + # UP + up = self.upsample1(x7, downsample4.size(), self.inference) + # R 85 + x8 = self.conv8(downsample4) + # R -1 -3 + x8 = torch.cat([x8, up], dim=1) + + x9 = self.conv9(x8) + x10 = self.conv10(x9) + x11 = self.conv11(x10) + x12 = self.conv12(x11) + x13 = self.conv13(x12) + x14 = self.conv14(x13) + + # UP + up = self.upsample2(x14, downsample3.size(), self.inference) + # R 54 + x15 = self.conv15(downsample3) + # R -1 -3 + x15 = torch.cat([x15, up], dim=1) + + x16 = self.conv16(x15) + x17 = self.conv17(x16) + x18 = self.conv18(x17) + x19 = self.conv19(x18) + x20 = self.conv20(x19) + return x20, x13, x6 + + +class Yolov4Head(nn.Module): + def __init__(self, output_ch, n_classes, inference=False, benchmark=False): + super().__init__() + self.inference = inference + self.benchmark = benchmark + + self.conv1 = Conv_Bn_Activation(128, 256, 3, 1, 'leaky') + self.conv2 = Conv_Bn_Activation(256, output_ch, 1, 1, 'linear', bn=False, bias=True) + + self.yolo1 = YoloLayer( + 
anchor_mask=[0, 1, 2], num_classes=n_classes, + anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401], + num_anchors=9, stride=8) + + # R -4 + self.conv3 = Conv_Bn_Activation(128, 256, 3, 2, 'leaky') + + # R -1 -16 + self.conv4 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky') + self.conv5 = Conv_Bn_Activation(256, 512, 3, 1, 'leaky') + self.conv6 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky') + self.conv7 = Conv_Bn_Activation(256, 512, 3, 1, 'leaky') + self.conv8 = Conv_Bn_Activation(512, 256, 1, 1, 'leaky') + self.conv9 = Conv_Bn_Activation(256, 512, 3, 1, 'leaky') + self.conv10 = Conv_Bn_Activation(512, output_ch, 1, 1, 'linear', bn=False, bias=True) + + self.yolo2 = YoloLayer( + anchor_mask=[3, 4, 5], num_classes=n_classes, + anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401], + num_anchors=9, stride=16) + + # R -4 + self.conv11 = Conv_Bn_Activation(256, 512, 3, 2, 'leaky') + + # R -1 -37 + self.conv12 = Conv_Bn_Activation(1024, 512, 1, 1, 'leaky') + self.conv13 = Conv_Bn_Activation(512, 1024, 3, 1, 'leaky') + self.conv14 = Conv_Bn_Activation(1024, 512, 1, 1, 'leaky') + self.conv15 = Conv_Bn_Activation(512, 1024, 3, 1, 'leaky') + self.conv16 = Conv_Bn_Activation(1024, 512, 1, 1, 'leaky') + self.conv17 = Conv_Bn_Activation(512, 1024, 3, 1, 'leaky') + self.conv18 = Conv_Bn_Activation(1024, output_ch, 1, 1, 'linear', bn=False, bias=True) + + self.yolo3 = YoloLayer( + anchor_mask=[6, 7, 8], num_classes=n_classes, + anchors=[12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401], + num_anchors=9, stride=32) + + def forward(self, input1, input2, input3): + x1 = self.conv1(input1) + x2 = self.conv2(x1) + + x3 = self.conv3(input1) + # R -1 -16 + x3 = torch.cat([x3, input2], dim=1) + x4 = self.conv4(x3) + x5 = self.conv5(x4) + x6 = self.conv6(x5) + x7 = self.conv7(x6) + x8 = self.conv8(x7) + x9 = self.conv9(x8) + x10 = self.conv10(x9) + + # R -4 + x11 = self.conv11(x8) + # R -1 -37 + x11 = torch.cat([x11, input3], dim=1) + + x12 = self.conv12(x11) + x13 = self.conv13(x12) + x14 = self.conv14(x13) + x15 = self.conv15(x14) + x16 = self.conv16(x15) + x17 = self.conv17(x16) + x18 = self.conv18(x17) + + if self.inference: + if self.benchmark==0: + y1 = self.yolo1(x2) + y2 = self.yolo2(x10) + y3 = self.yolo3(x18) + + return get_region_boxes([y1, y2, y3]) + else: + # slicing to trigger dequantization + y1 = x2[:] + y2 = x10[:] + y3 = x18[:] + return y1, y2, y3 + else: + return [x2, x10, x18] + + +class Yolov4(nn.Module): + def __init__(self, yolov4conv137weight=None, n_classes=80, inference=False, benchmark=False): + super().__init__() + + output_ch = (4 + 1 + n_classes) * 3 + + # backbone + self.down1 = DownSample1() + self.down2 = DownSample2() + self.down3 = DownSample3() + self.down4 = DownSample4() + self.down5 = DownSample5() + # neck + self.neek = Neck(inference) + # yolov4conv137 + if yolov4conv137weight: + _model = nn.Sequential(self.down1, self.down2, self.down3, self.down4, self.down5, self.neek) + pretrained_dict = torch.load(yolov4conv137weight) + + model_dict = _model.state_dict() + # 1. filter out unnecessary keys + pretrained_dict = {k1: v for (k, v), k1 in zip(pretrained_dict.items(), model_dict)} + # 2. 
overwrite entries in the existing state dict + model_dict.update(pretrained_dict) + _model.load_state_dict(model_dict) + + # head + self.head = Yolov4Head(output_ch, n_classes, inference, benchmark) + + + def forward(self, input): + d1 = self.down1(input) + d2 = self.down2(d1) + d3 = self.down3(d2) + d4 = self.down4(d3) + d5 = self.down5(d4) + + x20, x13, x6 = self.neek(d5, d4, d3) + + output = self.head(x20, x13, x6) + return output + + +if __name__ == "__main__": + import argparse + import sys + import cv2 + args = parser.parse_args() + n_classes = args.n_classes + weightfile = args.weight + imgfile = args.image + height = args.height + width = args.width + namesfile = None + if args.name: + namesfile = args.name + data_type = args.datatype + dummy = args.dummy + benchmark = args.benchmark + batch_size = args.batch_size + infer_iters = args.iter + + + import os + if dummy == 1 : + img = cv2.imread(os.path.join(os.getcwd(), "models/object_detection/pytorch/yolov4/inference/gpu/data/000000581918.jpg")) + else: + img = cv2.imread(imgfile) + + if data_type == "int8" and args.jit == 0: + print("int8 not support impe path") + sys.exit(1) + + # Inference input size is 416*416 does not mean training size is the same + # Training size could be 608*608 or even other sizes + # Optional inference sizes: + # Hight in {320, 416, 512, 608, ... 320 + 96 * n} + # Width in {320, 416, 512, 608, ... 320 + 96 * m} + sized = cv2.resize(img, (width, height)) + sized = cv2.cvtColor(sized, cv2.COLOR_BGR2RGB) + + if args.load: + load_path = args.load + if args.jit == -1: + if data_type in ("fp16", "int8"): + model = torch.jit.load(load_path) + if data_type == "fp32": + model = torch.load(load_path) + elif args.jit == 0: + model = torch.load(load_path) + elif args.jit == 1: + model = torch.jit.load(load_path) + else: + print("invalid argument") + sys.exit(1) + else: + model = Yolov4(yolov4conv137weight=None, n_classes=n_classes, inference=True, benchmark=benchmark) + model.eval() + + model = model.to('xpu') + pretrained_dict = torch.load(weightfile, map_location=torch.device('xpu')) + model.load_state_dict(pretrained_dict) + + if args.jit == 1 or (args.jit == -1 and (data_type in ("fp16", "int8"))): + print("JIT running ... ") + img_jit = sized + if type(img_jit) == np.ndarray and len(img_jit.shape) == 3: # cv2 image + img_jit = torch.from_numpy(img_jit.transpose(2, 0, 1)).float().div(255.0).unsqueeze(0) + elif type(img_jit) == np.ndarray and len(img_jit.shape) == 4: + img_jit = torch.from_numpy(img_jit.transpose(0, 3, 1, 2)).float().div(255.0) + else: + print("unknow image type") + exit(-1) + img_jit = torch.autograd.Variable(img_jit) + img_jit = img_jit.to('xpu') + + if data_type == "fp16": + img_jit = img_jit.half() + if benchmark == 1: + modelJit = torch.jit.trace(model, img_jit, check_trace=False) + else: + modelJit = torch.jit.trace(model, img_jit) + model = wrap_cpp_module(torch._C._jit_pass_fold_convbn(modelJit._c)) + + if data_type == 'int8': + print("Calibration for INT8 ... 
") + with torch.no_grad(): + qconfig_s8 = torch.quantization.QConfig( + activation=torch.quantization.observer.MinMaxObserver.with_args( + qscheme=torch.per_tensor_symmetric, + reduce_range=False, + dtype=torch.qint8 + ), + weight=torch.quantization.default_weight_observer + ) + model = prepare_jit(model, {'': qconfig_s8}, True) + + model(img_jit) + + model = convert_jit(model, True) + # print(model.graph_for(img_jit)) + from tool.utils import load_class_names, plot_boxes_cv2 + from tool.torch_utils import do_detect + + if args.jit == 0: + if data_type == 'fp16': + model = model.half() + + + total_latency = 0 + perf_start_iter = 1 + if (benchmark == 1 and infer_iters >= 500): + perf_start_iter = math.floor(infer_iters * 0.7) + for i in range(infer_iters): # This 'for' loop is for speed check + # Because the first iteration is usually longer + with torch.inference_mode(): + print("Iteration: ",i) + boxes, latency = do_detect(model, sized, 0.4, 0.6, i, dummy, batch_size, width, height, data_type, benchmark) + + if i >= perf_start_iter: + total_latency += latency + avg_latency = total_latency / (infer_iters - perf_start_iter) + print('FPS : %f' % (batch_size / avg_latency)) + if benchmark == 0: + if namesfile == None: + if n_classes == 20: + namesfile = 'data/voc.names' + elif n_classes == 80: + namesfile = 'data/coco.names' + else: + print("please give namefile") + + class_names = load_class_names(namesfile) + plot_boxes_cv2(img, boxes[0], 'predictions.jpg', class_names) + + if args.save: + store_path = args.save + if args.jit == -1: + if data_type in ("fp16", "int8"): + torch.jit.save(model, store_path) + if data_type == "fp32": + torch.save(model, store_path) + elif args.jit == 0: + torch.save(model, store_path) + else: + torch.jit.save(model, store_path) diff --git a/models/object_detection/pytorch/yolov4/inference/gpu/tool/__init__.py b/models/object_detection/pytorch/yolov4/inference/gpu/tool/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/models/object_detection/pytorch/yolov4/inference/gpu/tool/camera.py b/models/object_detection/pytorch/yolov4/inference/gpu/tool/camera.py new file mode 100644 index 000000000..d3692b68c --- /dev/null +++ b/models/object_detection/pytorch/yolov4/inference/gpu/tool/camera.py @@ -0,0 +1,77 @@ +# -*- coding: utf-8 -*- +''' +@Time : 2020/04/26 15:48 +@Author : Tianxiaomo +@File : camera.py +@Noice : +@Modificattion : + @Author : + @Time : + @Detail : + +''' +from __future__ import division +import cv2 +from tool.darknet2pytorch import Darknet +import argparse +from tool.utils import * +from tool.torch_utils import * + + +def arg_parse(): + """ + Parse arguements to the detect module + + """ + + parser = argparse.ArgumentParser(description='YOLO v3 Cam Demo') + parser.add_argument("--confidence", dest="confidence", help="Object Confidence to filter predictions", default=0.25) + parser.add_argument("--nms_thresh", dest="nms_thresh", help="NMS Threshhold", default=0.4) + parser.add_argument("--reso", dest='reso', help= + "Input resolution of the network. Increase to increase accuracy. 
Decrease to increase speed", + default="160", type=str) + return parser.parse_args() + + +if __name__ == '__main__': + cfgfile = "cfg/yolov4.cfg" + weightsfile = "weight/yolov4.weights" + + args = arg_parse() + confidence = float(args.confidence) + nms_thesh = float(args.nms_thresh) + CUDA = torch.cuda.is_available() + num_classes = 80 + bbox_attrs = 5 + num_classes + class_names = load_class_names("data/coco.names") + + model = Darknet(cfgfile) + model.load_weights(weightsfile) + + if CUDA: + model.cuda() + + model.eval() + cap = cv2.VideoCapture(0) + + assert cap.isOpened(), 'Cannot capture source' + + frames = 0 + start = time.time() + while cap.isOpened(): + ret, frame = cap.read() + if ret: + sized = cv2.resize(frame, (model.width, model.height)) + sized = cv2.cvtColor(sized, cv2.COLOR_BGR2RGB) + boxes = do_detect(model, sized, 0.5, 0.4, CUDA) + + orig_im = plot_boxes_cv2(frame, boxes, class_names=class_names) + + cv2.imshow("frame", orig_im) + key = cv2.waitKey(1) + if key & 0xFF == ord('q'): + break + frames += 1 + print("FPS of the video is {:5.2f}".format(frames / (time.time() - start))) + else: + break diff --git a/models/object_detection/pytorch/yolov4/inference/gpu/tool/coco_annotation.py b/models/object_detection/pytorch/yolov4/inference/gpu/tool/coco_annotation.py new file mode 100644 index 000000000..01d72021c --- /dev/null +++ b/models/object_detection/pytorch/yolov4/inference/gpu/tool/coco_annotation.py @@ -0,0 +1,73 @@ +# -*- coding: utf-8 -*- +''' +@Time : 2020/05/08 11:45 +@Author : Tianxiaomo +@File : coco_annotatin.py +@Noice : +@Modificattion : + @Author : + @Time : + @Detail : + +''' +import json +from collections import defaultdict +from tqdm import tqdm +import os + +"""hyper parameters""" +json_file_path = 'E:/Dataset/mscoco2017/annotations/instances_train2017.json' +images_dir_path = 'mscoco2017/train2017/' +output_path = '../data/val.txt' + +"""load json file""" +name_box_id = defaultdict(list) +id_name = dict() +with open(json_file_path, encoding='utf-8') as f: + data = json.load(f) + +"""generate labels""" +images = data['images'] +annotations = data['annotations'] +for ant in tqdm(annotations): + id = ant['image_id'] + # name = os.path.join(images_dir_path, images[id]['file_name']) + name = os.path.join(images_dir_path, '{:012d}.jpg'.format(id)) + cat = ant['category_id'] + + if cat >= 1 and cat <= 11: + cat = cat - 1 + elif cat >= 13 and cat <= 25: + cat = cat - 2 + elif cat >= 27 and cat <= 28: + cat = cat - 3 + elif cat >= 31 and cat <= 44: + cat = cat - 5 + elif cat >= 46 and cat <= 65: + cat = cat - 6 + elif cat == 67: + cat = cat - 7 + elif cat == 70: + cat = cat - 9 + elif cat >= 72 and cat <= 82: + cat = cat - 10 + elif cat >= 84 and cat <= 90: + cat = cat - 11 + + name_box_id[name].append([ant['bbox'], cat]) + +"""write to txt""" +with open(output_path, 'w') as f: + for key in tqdm(name_box_id.keys()): + f.write(key) + box_infos = name_box_id[key] + for info in box_infos: + x_min = int(info[0][0]) + y_min = int(info[0][1]) + x_max = x_min + int(info[0][2]) + y_max = y_min + int(info[0][3]) + + box_info = " %d,%d,%d,%d,%d" % ( + x_min, y_min, x_max, y_max, int(info[1])) + f.write(box_info) + f.write('\n') diff --git a/models/object_detection/pytorch/yolov4/inference/gpu/tool/config.py b/models/object_detection/pytorch/yolov4/inference/gpu/tool/config.py new file mode 100644 index 000000000..613166eb9 --- /dev/null +++ b/models/object_detection/pytorch/yolov4/inference/gpu/tool/config.py @@ -0,0 +1,247 @@ +import torch +from tool.torch_utils import 
convert2cpu + + +def parse_cfg(cfgfile): + blocks = [] + fp = open(cfgfile, 'r') + block = None + line = fp.readline() + while line != '': + line = line.rstrip() + if line == '' or line[0] == '#': + line = fp.readline() + continue + elif line[0] == '[': + if block: + blocks.append(block) + block = dict() + block['type'] = line.lstrip('[').rstrip(']') + # set default value + if block['type'] == 'convolutional': + block['batch_normalize'] = 0 + else: + key, value = line.split('=') + key = key.strip() + if key == 'type': + key = '_type' + value = value.strip() + block[key] = value + line = fp.readline() + + if block: + blocks.append(block) + fp.close() + return blocks + + +def print_cfg(blocks): + print('layer filters size input output'); + prev_width = 416 + prev_height = 416 + prev_filters = 3 + out_filters = [] + out_widths = [] + out_heights = [] + ind = -2 + for block in blocks: + ind = ind + 1 + if block['type'] == 'net': + prev_width = int(block['width']) + prev_height = int(block['height']) + continue + elif block['type'] == 'convolutional': + filters = int(block['filters']) + kernel_size = int(block['size']) + stride = int(block['stride']) + is_pad = int(block['pad']) + pad = (kernel_size - 1) // 2 if is_pad else 0 + width = (prev_width + 2 * pad - kernel_size) // stride + 1 + height = (prev_height + 2 * pad - kernel_size) // stride + 1 + print('%5d %-6s %4d %d x %d / %d %3d x %3d x%4d -> %3d x %3d x%4d' % ( + ind, 'conv', filters, kernel_size, kernel_size, stride, prev_width, prev_height, prev_filters, width, + height, filters)) + prev_width = width + prev_height = height + prev_filters = filters + out_widths.append(prev_width) + out_heights.append(prev_height) + out_filters.append(prev_filters) + elif block['type'] == 'maxpool': + pool_size = int(block['size']) + stride = int(block['stride']) + width = prev_width // stride + height = prev_height // stride + print('%5d %-6s %d x %d / %d %3d x %3d x%4d -> %3d x %3d x%4d' % ( + ind, 'max', pool_size, pool_size, stride, prev_width, prev_height, prev_filters, width, height, + filters)) + prev_width = width + prev_height = height + prev_filters = filters + out_widths.append(prev_width) + out_heights.append(prev_height) + out_filters.append(prev_filters) + elif block['type'] == 'avgpool': + width = 1 + height = 1 + print('%5d %-6s %3d x %3d x%4d -> %3d' % ( + ind, 'avg', prev_width, prev_height, prev_filters, prev_filters)) + prev_width = width + prev_height = height + prev_filters = filters + out_widths.append(prev_width) + out_heights.append(prev_height) + out_filters.append(prev_filters) + elif block['type'] == 'softmax': + print('%5d %-6s -> %3d' % (ind, 'softmax', prev_filters)) + out_widths.append(prev_width) + out_heights.append(prev_height) + out_filters.append(prev_filters) + elif block['type'] == 'cost': + print('%5d %-6s -> %3d' % (ind, 'cost', prev_filters)) + out_widths.append(prev_width) + out_heights.append(prev_height) + out_filters.append(prev_filters) + elif block['type'] == 'reorg': + stride = int(block['stride']) + filters = stride * stride * prev_filters + width = prev_width // stride + height = prev_height // stride + print('%5d %-6s / %d %3d x %3d x%4d -> %3d x %3d x%4d' % ( + ind, 'reorg', stride, prev_width, prev_height, prev_filters, width, height, filters)) + prev_width = width + prev_height = height + prev_filters = filters + out_widths.append(prev_width) + out_heights.append(prev_height) + out_filters.append(prev_filters) + elif block['type'] == 'upsample': + stride = int(block['stride']) + filters = 
prev_filters + width = prev_width * stride + height = prev_height * stride + print('%5d %-6s * %d %3d x %3d x%4d -> %3d x %3d x%4d' % ( + ind, 'upsample', stride, prev_width, prev_height, prev_filters, width, height, filters)) + prev_width = width + prev_height = height + prev_filters = filters + out_widths.append(prev_width) + out_heights.append(prev_height) + out_filters.append(prev_filters) + elif block['type'] == 'route': + layers = block['layers'].split(',') + layers = [int(i) if int(i) > 0 else int(i) + ind for i in layers] + if len(layers) == 1: + print('%5d %-6s %d' % (ind, 'route', layers[0])) + prev_width = out_widths[layers[0]] + prev_height = out_heights[layers[0]] + prev_filters = out_filters[layers[0]] + elif len(layers) == 2: + print('%5d %-6s %d %d' % (ind, 'route', layers[0], layers[1])) + prev_width = out_widths[layers[0]] + prev_height = out_heights[layers[0]] + assert (prev_width == out_widths[layers[1]]) + assert (prev_height == out_heights[layers[1]]) + prev_filters = out_filters[layers[0]] + out_filters[layers[1]] + elif len(layers) == 4: + print('%5d %-6s %d %d %d %d' % (ind, 'route', layers[0], layers[1], layers[2], layers[3])) + prev_width = out_widths[layers[0]] + prev_height = out_heights[layers[0]] + assert (prev_width == out_widths[layers[1]] == out_widths[layers[2]] == out_widths[layers[3]]) + assert (prev_height == out_heights[layers[1]] == out_heights[layers[2]] == out_heights[layers[3]]) + prev_filters = out_filters[layers[0]] + out_filters[layers[1]] + out_filters[layers[2]] + out_filters[ + layers[3]] + else: + print("route error !!! {} {} {}".format(sys._getframe().f_code.co_filename, + sys._getframe().f_code.co_name, sys._getframe().f_lineno)) + + out_widths.append(prev_width) + out_heights.append(prev_height) + out_filters.append(prev_filters) + elif block['type'] in ['region', 'yolo']: + print('%5d %-6s' % (ind, 'detection')) + out_widths.append(prev_width) + out_heights.append(prev_height) + out_filters.append(prev_filters) + elif block['type'] == 'shortcut': + from_id = int(block['from']) + from_id = from_id if from_id > 0 else from_id + ind + print('%5d %-6s %d' % (ind, 'shortcut', from_id)) + prev_width = out_widths[from_id] + prev_height = out_heights[from_id] + prev_filters = out_filters[from_id] + out_widths.append(prev_width) + out_heights.append(prev_height) + out_filters.append(prev_filters) + elif block['type'] == 'connected': + filters = int(block['output']) + print('%5d %-6s %d -> %3d' % (ind, 'connected', prev_filters, filters)) + prev_filters = filters + out_widths.append(1) + out_heights.append(1) + out_filters.append(prev_filters) + else: + print('unknown type %s' % (block['type'])) + + +def load_conv(buf, start, conv_model): + num_w = conv_model.weight.numel() + num_b = conv_model.bias.numel() + conv_model.bias.data.copy_(torch.from_numpy(buf[start:start + num_b])); + start = start + num_b + conv_model.weight.data.copy_(torch.from_numpy(buf[start:start + num_w]).reshape(conv_model.weight.data.shape)); + start = start + num_w + return start + + +def save_conv(fp, conv_model): + conv_model.bias.data.numpy().tofile(fp) + conv_model.weight.data.numpy().tofile(fp) + + +def load_conv_bn(buf, start, conv_model, bn_model): + num_w = conv_model.weight.numel() + num_b = bn_model.bias.numel() + bn_model.bias.data.copy_(torch.from_numpy(buf[start:start + num_b])); + start = start + num_b + bn_model.weight.data.copy_(torch.from_numpy(buf[start:start + num_b])); + start = start + num_b + 
bn_model.running_mean.copy_(torch.from_numpy(buf[start:start + num_b])); + start = start + num_b + bn_model.running_var.copy_(torch.from_numpy(buf[start:start + num_b])); + start = start + num_b + conv_model.weight.data.copy_(torch.from_numpy(buf[start:start + num_w]).reshape(conv_model.weight.data.shape)); + start = start + num_w + return start + + +def save_conv_bn(fp, conv_model, bn_model): + bn_model.bias.data.numpy().tofile(fp) + bn_model.weight.data.numpy().tofile(fp) + bn_model.running_mean.numpy().tofile(fp) + bn_model.running_var.numpy().tofile(fp) + conv_model.weight.data.numpy().tofile(fp) + + +def load_fc(buf, start, fc_model): + num_w = fc_model.weight.numel() + num_b = fc_model.bias.numel() + fc_model.bias.data.copy_(torch.from_numpy(buf[start:start + num_b])); + start = start + num_b + fc_model.weight.data.copy_(torch.from_numpy(buf[start:start + num_w])); + start = start + num_w + return start + + +def save_fc(fp, fc_model): + fc_model.bias.data.numpy().tofile(fp) + fc_model.weight.data.numpy().tofile(fp) + + +if __name__ == '__main__': + import sys + + blocks = parse_cfg('cfg/yolo.cfg') + if len(sys.argv) == 2: + blocks = parse_cfg(sys.argv[1]) + print_cfg(blocks) diff --git a/models/object_detection/pytorch/yolov4/inference/gpu/tool/darknet2onnx.py b/models/object_detection/pytorch/yolov4/inference/gpu/tool/darknet2onnx.py new file mode 100644 index 000000000..5c8c8e23b --- /dev/null +++ b/models/object_detection/pytorch/yolov4/inference/gpu/tool/darknet2onnx.py @@ -0,0 +1,74 @@ +import sys +import torch +from tool.darknet2pytorch import Darknet + + +def transform_to_onnx(cfgfile, weightfile, batch_size=1): + model = Darknet(cfgfile) + + model.print_network() + model.load_weights(weightfile) + print('Loading weights from %s... Done!' 
% (weightfile)) + + dynamic = False + if batch_size <= 0: + dynamic = True + + input_names = ["input"] + output_names = ['boxes', 'confs'] + + if dynamic: + x = torch.randn((1, 3, model.height, model.width), requires_grad=True) + onnx_file_name = "yolov4_-1_3_{}_{}_dynamic.onnx".format(model.height, model.width) + dynamic_axes = {"input": {0: "batch_size"}, "boxes": {0: "batch_size"}, "confs": {0: "batch_size"}} + # Export the model + print('Export the onnx model ...') + torch.onnx.export(model, + x, + onnx_file_name, + export_params=True, + opset_version=11, + do_constant_folding=True, + input_names=input_names, output_names=output_names, + dynamic_axes=dynamic_axes) + + print('Onnx model exporting done') + return onnx_file_name + + else: + x = torch.randn((batch_size, 3, model.height, model.width), requires_grad=True) + onnx_file_name = "yolov4_{}_3_{}_{}_static.onnx".format(batch_size, model.height, model.width) + torch.onnx.export(model, + x, + onnx_file_name, + export_params=True, + opset_version=11, + do_constant_folding=True, + input_names=input_names, output_names=output_names, + dynamic_axes=None) + + print('Onnx model exporting done') + return onnx_file_name + + +if __name__ == '__main__': + if len(sys.argv) == 3: + cfgfile = sys.argv[1] + weightfile = sys.argv[2] + transform_to_onnx(cfgfile, weightfile) + elif len(sys.argv) == 4: + cfgfile = sys.argv[1] + weightfile = sys.argv[2] + batch_size = int(sys.argv[3]) + transform_to_onnx(cfgfile, weightfile, batch_size) + elif len(sys.argv) == 5: + cfgfile = sys.argv[1] + weightfile = sys.argv[2] + batch_size = int(sys.argv[3]) + dynamic = True if sys.argv[4] == 'True' else False + transform_to_onnx(cfgfile, weightfile, batch_size, dynamic) + else: + print('Please execute this script this way:\n') + print(' python darknet2onnx.py ') + print('or') + print(' python darknet2onnx.py ') diff --git a/models/object_detection/pytorch/yolov4/inference/gpu/tool/darknet2pytorch.py b/models/object_detection/pytorch/yolov4/inference/gpu/tool/darknet2pytorch.py new file mode 100644 index 000000000..265b20e49 --- /dev/null +++ b/models/object_detection/pytorch/yolov4/inference/gpu/tool/darknet2pytorch.py @@ -0,0 +1,514 @@ +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +from tool.region_loss import RegionLoss +from tool.yolo_layer import YoloLayer +from tool.config import * +from tool.torch_utils import * + + +class Mish(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x): + x = x * (torch.tanh(torch.nn.functional.softplus(x))) + return x + + +class MaxPoolDark(nn.Module): + def __init__(self, size=2, stride=1): + super(MaxPoolDark, self).__init__() + self.size = size + self.stride = stride + + def forward(self, x): + ''' + darknet output_size = (input_size + p - k) / s +1 + p : padding = k - 1 + k : size + s : stride + torch output_size = (input_size + 2*p -k) / s +1 + p : padding = k//2 + ''' + p = self.size // 2 + if ((x.shape[2] - 1) // self.stride) != ((x.shape[2] + 2 * p - self.size) // self.stride): + padding1 = (self.size - 1) // 2 + padding2 = padding1 + 1 + else: + padding1 = (self.size - 1) // 2 + padding2 = padding1 + if ((x.shape[3] - 1) // self.stride) != ((x.shape[3] + 2 * p - self.size) // self.stride): + padding3 = (self.size - 1) // 2 + padding4 = padding3 + 1 + else: + padding3 = (self.size - 1) // 2 + padding4 = padding3 + x = F.max_pool2d(F.pad(x, (padding3, padding4, padding1, padding2), mode='replicate'), + self.size, stride=self.stride) + return x + + +class 
Upsample_expand(nn.Module): + def __init__(self, stride=2): + super(Upsample_expand, self).__init__() + self.stride = stride + + def forward(self, x): + assert (x.data.dim() == 4) + + x = x.view(x.size(0), x.size(1), x.size(2), 1, x.size(3), 1).\ + expand(x.size(0), x.size(1), x.size(2), self.stride, x.size(3), self.stride).contiguous().\ + view(x.size(0), x.size(1), x.size(2) * self.stride, x.size(3) * self.stride) + + return x + + +class Upsample_interpolate(nn.Module): + def __init__(self, stride): + super(Upsample_interpolate, self).__init__() + self.stride = stride + + def forward(self, x): + assert (x.data.dim() == 4) + + out = F.interpolate(x, size=(x.size(2) * self.stride, x.size(3) * self.stride), mode='nearest') + return out + + +class Reorg(nn.Module): + def __init__(self, stride=2): + super(Reorg, self).__init__() + self.stride = stride + + def forward(self, x): + stride = self.stride + assert (x.data.dim() == 4) + B = x.data.size(0) + C = x.data.size(1) + H = x.data.size(2) + W = x.data.size(3) + assert (H % stride == 0) + assert (W % stride == 0) + ws = stride + hs = stride + x = x.view(B, C, H / hs, hs, W / ws, ws).transpose(3, 4).contiguous() + x = x.view(B, C, H / hs * W / ws, hs * ws).transpose(2, 3).contiguous() + x = x.view(B, C, hs * ws, H / hs, W / ws).transpose(1, 2).contiguous() + x = x.view(B, hs * ws * C, H / hs, W / ws) + return x + + +class GlobalAvgPool2d(nn.Module): + def __init__(self): + super(GlobalAvgPool2d, self).__init__() + + def forward(self, x): + N = x.data.size(0) + C = x.data.size(1) + H = x.data.size(2) + W = x.data.size(3) + x = F.avg_pool2d(x, (H, W)) + x = x.view(N, C) + return x + + +# for route and shortcut +class EmptyModule(nn.Module): + def __init__(self): + super(EmptyModule, self).__init__() + + def forward(self, x): + return x + + +# support route shortcut and reorg +class Darknet(nn.Module): + def __init__(self, cfgfile, inference=False): + super(Darknet, self).__init__() + self.inference = inference + self.training = not self.inference + + self.blocks = parse_cfg(cfgfile) + self.width = int(self.blocks[0]['width']) + self.height = int(self.blocks[0]['height']) + + self.models = self.create_network(self.blocks) # merge conv, bn,leaky + self.loss = self.models[len(self.models) - 1] + + if self.blocks[(len(self.blocks) - 1)]['type'] == 'region': + self.anchors = self.loss.anchors + self.num_anchors = self.loss.num_anchors + self.anchor_step = self.loss.anchor_step + self.num_classes = self.loss.num_classes + + self.header = torch.IntTensor([0, 0, 0, 0]) + self.seen = 0 + + def forward(self, x): + ind = -2 + self.loss = None + outputs = dict() + out_boxes = [] + for block in self.blocks: + ind = ind + 1 + # if ind > 0: + # return x + + if block['type'] == 'net': + continue + elif block['type'] in ['convolutional', 'maxpool', 'reorg', 'upsample', 'avgpool', 'softmax', 'connected']: + x = self.models[ind](x) + outputs[ind] = x + elif block['type'] == 'route': + layers = block['layers'].split(',') + layers = [int(i) if int(i) > 0 else int(i) + ind for i in layers] + if len(layers) == 1: + if 'groups' not in block.keys() or int(block['groups']) == 1: + x = outputs[layers[0]] + outputs[ind] = x + else: + groups = int(block['groups']) + group_id = int(block['group_id']) + _, b, _, _ = outputs[layers[0]].shape + x = outputs[layers[0]][:, b // groups * group_id:b // groups * (group_id + 1)] + outputs[ind] = x + elif len(layers) == 2: + x1 = outputs[layers[0]] + x2 = outputs[layers[1]] + x = torch.cat((x1, x2), 1) + outputs[ind] = x + elif 
len(layers) == 4: + x1 = outputs[layers[0]] + x2 = outputs[layers[1]] + x3 = outputs[layers[2]] + x4 = outputs[layers[3]] + x = torch.cat((x1, x2, x3, x4), 1) + outputs[ind] = x + else: + print("rounte number > 2 ,is {}".format(len(layers))) + + elif block['type'] == 'shortcut': + from_layer = int(block['from']) + activation = block['activation'] + from_layer = from_layer if from_layer > 0 else from_layer + ind + x1 = outputs[from_layer] + x2 = outputs[ind - 1] + x = x1 + x2 + if activation == 'leaky': + x = F.leaky_relu(x, 0.1, inplace=True) + elif activation == 'relu': + x = F.relu(x, inplace=True) + outputs[ind] = x + elif block['type'] == 'region': + continue + if self.loss: + self.loss = self.loss + self.models[ind](x) + else: + self.loss = self.models[ind](x) + outputs[ind] = None + elif block['type'] == 'yolo': + # if self.training: + # pass + # else: + # boxes = self.models[ind](x) + # out_boxes.append(boxes) + boxes = self.models[ind](x) + out_boxes.append(boxes) + elif block['type'] == 'cost': + continue + else: + print('unknown type %s' % (block['type'])) + + if self.training: + return out_boxes + else: + return get_region_boxes(out_boxes) + + def print_network(self): + print_cfg(self.blocks) + + def create_network(self, blocks): + models = nn.ModuleList() + + prev_filters = 3 + out_filters = [] + prev_stride = 1 + out_strides = [] + conv_id = 0 + for block in blocks: + if block['type'] == 'net': + prev_filters = int(block['channels']) + continue + elif block['type'] == 'convolutional': + conv_id = conv_id + 1 + batch_normalize = int(block['batch_normalize']) + filters = int(block['filters']) + kernel_size = int(block['size']) + stride = int(block['stride']) + is_pad = int(block['pad']) + pad = (kernel_size - 1) // 2 if is_pad else 0 + activation = block['activation'] + model = nn.Sequential() + if batch_normalize: + model.add_module('conv{0}'.format(conv_id), + nn.Conv2d(prev_filters, filters, kernel_size, stride, pad, bias=False)) + model.add_module('bn{0}'.format(conv_id), nn.BatchNorm2d(filters)) + # model.add_module('bn{0}'.format(conv_id), BN2d(filters)) + else: + model.add_module('conv{0}'.format(conv_id), + nn.Conv2d(prev_filters, filters, kernel_size, stride, pad)) + if activation == 'leaky': + model.add_module('leaky{0}'.format(conv_id), nn.LeakyReLU(0.1, inplace=True)) + elif activation == 'relu': + model.add_module('relu{0}'.format(conv_id), nn.ReLU(inplace=True)) + elif activation == 'mish': + model.add_module('mish{0}'.format(conv_id), Mish()) + else: + print("convalution havn't activate {}".format(activation)) + + prev_filters = filters + out_filters.append(prev_filters) + prev_stride = stride * prev_stride + out_strides.append(prev_stride) + models.append(model) + elif block['type'] == 'maxpool': + pool_size = int(block['size']) + stride = int(block['stride']) + if stride == 1 and pool_size % 2: + # You can use Maxpooldark instead, here is convenient to convert onnx. + # Example: [maxpool] size=3 stride=1 + model = nn.MaxPool2d(kernel_size=pool_size, stride=stride, padding=pool_size // 2) + elif stride == pool_size: + # You can use Maxpooldark instead, here is convenient to convert onnx. 
+ # Example: [maxpool] size=2 stride=2 + model = nn.MaxPool2d(kernel_size=pool_size, stride=stride, padding=0) + else: + model = MaxPoolDark(pool_size, stride) + out_filters.append(prev_filters) + prev_stride = stride * prev_stride + out_strides.append(prev_stride) + models.append(model) + elif block['type'] == 'avgpool': + model = GlobalAvgPool2d() + out_filters.append(prev_filters) + models.append(model) + elif block['type'] == 'softmax': + model = nn.Softmax() + out_strides.append(prev_stride) + out_filters.append(prev_filters) + models.append(model) + elif block['type'] == 'cost': + if block['_type'] == 'sse': + model = nn.MSELoss(reduction='mean') + elif block['_type'] == 'L1': + model = nn.L1Loss(reduction='mean') + elif block['_type'] == 'smooth': + model = nn.SmoothL1Loss(reduction='mean') + out_filters.append(1) + out_strides.append(prev_stride) + models.append(model) + elif block['type'] == 'reorg': + stride = int(block['stride']) + prev_filters = stride * stride * prev_filters + out_filters.append(prev_filters) + prev_stride = prev_stride * stride + out_strides.append(prev_stride) + models.append(Reorg(stride)) + elif block['type'] == 'upsample': + stride = int(block['stride']) + out_filters.append(prev_filters) + prev_stride = prev_stride // stride + out_strides.append(prev_stride) + + models.append(Upsample_expand(stride)) + # models.append(Upsample_interpolate(stride)) + + elif block['type'] == 'route': + layers = block['layers'].split(',') + ind = len(models) + layers = [int(i) if int(i) > 0 else int(i) + ind for i in layers] + if len(layers) == 1: + if 'groups' not in block.keys() or int(block['groups']) == 1: + prev_filters = out_filters[layers[0]] + prev_stride = out_strides[layers[0]] + else: + prev_filters = out_filters[layers[0]] // int(block['groups']) + prev_stride = out_strides[layers[0]] // int(block['groups']) + elif len(layers) == 2: + assert (layers[0] == ind - 1 or layers[1] == ind - 1) + prev_filters = out_filters[layers[0]] + out_filters[layers[1]] + prev_stride = out_strides[layers[0]] + elif len(layers) == 4: + assert (layers[0] == ind - 1) + prev_filters = out_filters[layers[0]] + out_filters[layers[1]] + out_filters[layers[2]] + \ + out_filters[layers[3]] + prev_stride = out_strides[layers[0]] + else: + print("route error!!!") + + out_filters.append(prev_filters) + out_strides.append(prev_stride) + models.append(EmptyModule()) + elif block['type'] == 'shortcut': + ind = len(models) + prev_filters = out_filters[ind - 1] + out_filters.append(prev_filters) + prev_stride = out_strides[ind - 1] + out_strides.append(prev_stride) + models.append(EmptyModule()) + elif block['type'] == 'connected': + filters = int(block['output']) + if block['activation'] == 'linear': + model = nn.Linear(prev_filters, filters) + elif block['activation'] == 'leaky': + model = nn.Sequential( + nn.Linear(prev_filters, filters), + nn.LeakyReLU(0.1, inplace=True)) + elif block['activation'] == 'relu': + model = nn.Sequential( + nn.Linear(prev_filters, filters), + nn.ReLU(inplace=True)) + prev_filters = filters + out_filters.append(prev_filters) + out_strides.append(prev_stride) + models.append(model) + elif block['type'] == 'region': + loss = RegionLoss() + anchors = block['anchors'].split(',') + loss.anchors = [float(i) for i in anchors] + loss.num_classes = int(block['classes']) + loss.num_anchors = int(block['num']) + loss.anchor_step = len(loss.anchors) // loss.num_anchors + loss.object_scale = float(block['object_scale']) + loss.noobject_scale = float(block['noobject_scale']) + 
loss.class_scale = float(block['class_scale']) + loss.coord_scale = float(block['coord_scale']) + out_filters.append(prev_filters) + out_strides.append(prev_stride) + models.append(loss) + elif block['type'] == 'yolo': + yolo_layer = YoloLayer() + anchors = block['anchors'].split(',') + anchor_mask = block['mask'].split(',') + yolo_layer.anchor_mask = [int(i) for i in anchor_mask] + yolo_layer.anchors = [float(i) for i in anchors] + yolo_layer.num_classes = int(block['classes']) + self.num_classes = yolo_layer.num_classes + yolo_layer.num_anchors = int(block['num']) + yolo_layer.anchor_step = len(yolo_layer.anchors) // yolo_layer.num_anchors + yolo_layer.stride = prev_stride + yolo_layer.scale_x_y = float(block['scale_x_y']) + # yolo_layer.object_scale = float(block['object_scale']) + # yolo_layer.noobject_scale = float(block['noobject_scale']) + # yolo_layer.class_scale = float(block['class_scale']) + # yolo_layer.coord_scale = float(block['coord_scale']) + out_filters.append(prev_filters) + out_strides.append(prev_stride) + models.append(yolo_layer) + else: + print('unknown type %s' % (block['type'])) + + return models + + def load_weights(self, weightfile): + fp = open(weightfile, 'rb') + header = np.fromfile(fp, count=5, dtype=np.int32) + self.header = torch.from_numpy(header) + self.seen = self.header[3] + buf = np.fromfile(fp, dtype=np.float32) + fp.close() + + start = 0 + ind = -2 + for block in self.blocks: + if start >= buf.size: + break + ind = ind + 1 + if block['type'] == 'net': + continue + elif block['type'] == 'convolutional': + model = self.models[ind] + batch_normalize = int(block['batch_normalize']) + if batch_normalize: + start = load_conv_bn(buf, start, model[0], model[1]) + else: + start = load_conv(buf, start, model[0]) + elif block['type'] == 'connected': + model = self.models[ind] + if block['activation'] != 'linear': + start = load_fc(buf, start, model[0]) + else: + start = load_fc(buf, start, model) + elif block['type'] == 'maxpool': + pass + elif block['type'] == 'reorg': + pass + elif block['type'] == 'upsample': + pass + elif block['type'] == 'route': + pass + elif block['type'] == 'shortcut': + pass + elif block['type'] == 'region': + pass + elif block['type'] == 'yolo': + pass + elif block['type'] == 'avgpool': + pass + elif block['type'] == 'softmax': + pass + elif block['type'] == 'cost': + pass + else: + print('unknown type %s' % (block['type'])) + + # def save_weights(self, outfile, cutoff=0): + # if cutoff <= 0: + # cutoff = len(self.blocks) - 1 + # + # fp = open(outfile, 'wb') + # self.header[3] = self.seen + # header = self.header + # header.numpy().tofile(fp) + # + # ind = -1 + # for blockId in range(1, cutoff + 1): + # ind = ind + 1 + # block = self.blocks[blockId] + # if block['type'] == 'convolutional': + # model = self.models[ind] + # batch_normalize = int(block['batch_normalize']) + # if batch_normalize: + # save_conv_bn(fp, model[0], model[1]) + # else: + # save_conv(fp, model[0]) + # elif block['type'] == 'connected': + # model = self.models[ind] + # if block['activation'] != 'linear': + # save_fc(fc, model) + # else: + # save_fc(fc, model[0]) + # elif block['type'] == 'maxpool': + # pass + # elif block['type'] == 'reorg': + # pass + # elif block['type'] == 'upsample': + # pass + # elif block['type'] == 'route': + # pass + # elif block['type'] == 'shortcut': + # pass + # elif block['type'] == 'region': + # pass + # elif block['type'] == 'yolo': + # pass + # elif block['type'] == 'avgpool': + # pass + # elif block['type'] == 'softmax': + # pass 
+ # elif block['type'] == 'cost': + # pass + # else: + # print('unknown type %s' % (block['type'])) + # fp.close() diff --git a/models/object_detection/pytorch/yolov4/inference/gpu/tool/onnx2tensorflow.py b/models/object_detection/pytorch/yolov4/inference/gpu/tool/onnx2tensorflow.py new file mode 100644 index 000000000..d79628b07 --- /dev/null +++ b/models/object_detection/pytorch/yolov4/inference/gpu/tool/onnx2tensorflow.py @@ -0,0 +1,29 @@ +import sys +import onnx +from onnx_tf.backend import prepare + + +# tensorflow >=2.0 +# 1: Thanks:github:https://github.com/onnx/onnx-tensorflow +# 2: Run git clone https://github.com/onnx/onnx-tensorflow.git && cd onnx-tensorflow +# Run pip install -e . +# Note: +# Errors will occur when using "pip install onnx-tf", at least for me, +# it is recommended to use source code installation +def transform_to_tensorflow(onnx_input_path, pb_output_path): + onnx_model = onnx.load(onnx_input_path) # load onnx model + tf_exp = prepare(onnx_model) # prepare tf representation + tf_exp.export_graph(pb_output_path) # export the model + + +if __name__ == '__main__': + if len(sys.argv) == 1: + sys.argv.append('../weight/yolov4_1_3_608_608.onnx') # use:darknet2onnx.py + sys.argv.append('../weight/yolov4.pb') # use:onnx2tensorflow.py + if len(sys.argv) == 3: + onnxfile = sys.argv[1] + tfpb_outfile = sys.argv[2] + transform_to_tensorflow(onnxfile, tfpb_outfile) + else: + print('Please execute this script this way:\n') + print(' python onnx2tensorflow.py ') diff --git a/models/object_detection/pytorch/yolov4/inference/gpu/tool/region_loss.py b/models/object_detection/pytorch/yolov4/inference/gpu/tool/region_loss.py new file mode 100644 index 000000000..28103365a --- /dev/null +++ b/models/object_detection/pytorch/yolov4/inference/gpu/tool/region_loss.py @@ -0,0 +1,195 @@ +import torch.nn as nn +import torch.nn.functional as F +from tool.torch_utils import * + + +def build_targets(pred_boxes, target, anchors, num_anchors, num_classes, nH, nW, noobject_scale, object_scale, + sil_thresh, seen): + nB = target.size(0) + nA = num_anchors + nC = num_classes + anchor_step = len(anchors) / num_anchors + conf_mask = torch.ones(nB, nA, nH, nW) * noobject_scale + coord_mask = torch.zeros(nB, nA, nH, nW) + cls_mask = torch.zeros(nB, nA, nH, nW) + tx = torch.zeros(nB, nA, nH, nW) + ty = torch.zeros(nB, nA, nH, nW) + tw = torch.zeros(nB, nA, nH, nW) + th = torch.zeros(nB, nA, nH, nW) + tconf = torch.zeros(nB, nA, nH, nW) + tcls = torch.zeros(nB, nA, nH, nW) + + nAnchors = nA * nH * nW + nPixels = nH * nW + for b in range(nB): + cur_pred_boxes = pred_boxes[b * nAnchors:(b + 1) * nAnchors].t() + cur_ious = torch.zeros(nAnchors) + for t in range(50): + if target[b][t * 5 + 1] == 0: + break + gx = target[b][t * 5 + 1] * nW + gy = target[b][t * 5 + 2] * nH + gw = target[b][t * 5 + 3] * nW + gh = target[b][t * 5 + 4] * nH + cur_gt_boxes = torch.FloatTensor([gx, gy, gw, gh]).repeat(nAnchors, 1).t() + cur_ious = torch.max(cur_ious, bbox_ious(cur_pred_boxes, cur_gt_boxes, x1y1x2y2=False)) + conf_mask[b][cur_ious > sil_thresh] = 0 + if seen < 12800: + if anchor_step == 4: + tx = torch.FloatTensor(anchors).view(nA, anchor_step).index_select(1, torch.LongTensor([2])).view(1, nA, 1, + 1).repeat( + nB, 1, nH, nW) + ty = torch.FloatTensor(anchors).view(num_anchors, anchor_step).index_select(1, torch.LongTensor([2])).view( + 1, nA, 1, 1).repeat(nB, 1, nH, nW) + else: + tx.fill_(0.5) + ty.fill_(0.5) + tw.zero_() + th.zero_() + coord_mask.fill_(1) + + nGT = 0 + nCorrect = 0 + for b in range(nB): + for t 
in range(50): + if target[b][t * 5 + 1] == 0: + break + nGT = nGT + 1 + best_iou = 0.0 + best_n = -1 + min_dist = 10000 + gx = target[b][t * 5 + 1] * nW + gy = target[b][t * 5 + 2] * nH + gi = int(gx) + gj = int(gy) + gw = target[b][t * 5 + 3] * nW + gh = target[b][t * 5 + 4] * nH + gt_box = [0, 0, gw, gh] + for n in range(nA): + aw = anchors[anchor_step * n] + ah = anchors[anchor_step * n + 1] + anchor_box = [0, 0, aw, ah] + iou = bbox_iou(anchor_box, gt_box, x1y1x2y2=False) + if anchor_step == 4: + ax = anchors[anchor_step * n + 2] + ay = anchors[anchor_step * n + 3] + dist = pow(((gi + ax) - gx), 2) + pow(((gj + ay) - gy), 2) + if iou > best_iou: + best_iou = iou + best_n = n + elif anchor_step == 4 and iou == best_iou and dist < min_dist: + best_iou = iou + best_n = n + min_dist = dist + + gt_box = [gx, gy, gw, gh] + pred_box = pred_boxes[b * nAnchors + best_n * nPixels + gj * nW + gi] + + coord_mask[b][best_n][gj][gi] = 1 + cls_mask[b][best_n][gj][gi] = 1 + conf_mask[b][best_n][gj][gi] = object_scale + tx[b][best_n][gj][gi] = target[b][t * 5 + 1] * nW - gi + ty[b][best_n][gj][gi] = target[b][t * 5 + 2] * nH - gj + tw[b][best_n][gj][gi] = math.log(gw / anchors[anchor_step * best_n]) + th[b][best_n][gj][gi] = math.log(gh / anchors[anchor_step * best_n + 1]) + iou = bbox_iou(gt_box, pred_box, x1y1x2y2=False) # best_iou + tconf[b][best_n][gj][gi] = iou + tcls[b][best_n][gj][gi] = target[b][t * 5] + if iou > 0.5: + nCorrect = nCorrect + 1 + + return nGT, nCorrect, coord_mask, conf_mask, cls_mask, tx, ty, tw, th, tconf, tcls + + +class RegionLoss(nn.Module): + def __init__(self, num_classes=0, anchors=[], num_anchors=1): + super(RegionLoss, self).__init__() + self.num_classes = num_classes + self.anchors = anchors + self.num_anchors = num_anchors + self.anchor_step = len(anchors) / num_anchors + self.coord_scale = 1 + self.noobject_scale = 1 + self.object_scale = 5 + self.class_scale = 1 + self.thresh = 0.6 + self.seen = 0 + + def forward(self, output, target): + # output : BxAs*(4+1+num_classes)*H*W + t0 = time.time() + nB = output.data.size(0) + nA = self.num_anchors + nC = self.num_classes + nH = output.data.size(2) + nW = output.data.size(3) + + output = output.view(nB, nA, (5 + nC), nH, nW) + x = F.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([0]))).view(nB, nA, nH, nW)) + y = F.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([1]))).view(nB, nA, nH, nW)) + w = output.index_select(2, Variable(torch.cuda.LongTensor([2]))).view(nB, nA, nH, nW) + h = output.index_select(2, Variable(torch.cuda.LongTensor([3]))).view(nB, nA, nH, nW) + conf = F.sigmoid(output.index_select(2, Variable(torch.cuda.LongTensor([4]))).view(nB, nA, nH, nW)) + cls = output.index_select(2, Variable(torch.linspace(5, 5 + nC - 1, nC).long().cuda())) + cls = cls.view(nB * nA, nC, nH * nW).transpose(1, 2).contiguous().view(nB * nA * nH * nW, nC) + t1 = time.time() + + pred_boxes = torch.cuda.FloatTensor(4, nB * nA * nH * nW) + grid_x = torch.linspace(0, nW - 1, nW).repeat(nH, 1).repeat(nB * nA, 1, 1).view(nB * nA * nH * nW).cuda() + grid_y = torch.linspace(0, nH - 1, nH).repeat(nW, 1).t().repeat(nB * nA, 1, 1).view(nB * nA * nH * nW).cuda() + anchor_w = torch.Tensor(self.anchors).view(nA, self.anchor_step).index_select(1, torch.LongTensor([0])).cuda() + anchor_h = torch.Tensor(self.anchors).view(nA, self.anchor_step).index_select(1, torch.LongTensor([1])).cuda() + anchor_w = anchor_w.repeat(nB, 1).repeat(1, 1, nH * nW).view(nB * nA * nH * nW) + anchor_h = anchor_h.repeat(nB, 1).repeat(1, 1, 
nH * nW).view(nB * nA * nH * nW) + pred_boxes[0] = x.data + grid_x + pred_boxes[1] = y.data + grid_y + pred_boxes[2] = torch.exp(w.data) * anchor_w + pred_boxes[3] = torch.exp(h.data) * anchor_h + pred_boxes = convert2cpu(pred_boxes.transpose(0, 1).contiguous().view(-1, 4)) + t2 = time.time() + + nGT, nCorrect, coord_mask, conf_mask, cls_mask, tx, ty, tw, th, tconf, tcls = build_targets(pred_boxes, + target.data, + self.anchors, nA, + nC, \ + nH, nW, + self.noobject_scale, + self.object_scale, + self.thresh, + self.seen) + cls_mask = (cls_mask == 1) + nProposals = int((conf > 0.25).sum().data[0]) + + tx = Variable(tx.cuda()) + ty = Variable(ty.cuda()) + tw = Variable(tw.cuda()) + th = Variable(th.cuda()) + tconf = Variable(tconf.cuda()) + tcls = Variable(tcls.view(-1)[cls_mask].long().cuda()) + + coord_mask = Variable(coord_mask.cuda()) + conf_mask = Variable(conf_mask.cuda().sqrt()) + cls_mask = Variable(cls_mask.view(-1, 1).repeat(1, nC).cuda()) + cls = cls[cls_mask].view(-1, nC) + + t3 = time.time() + + loss_x = self.coord_scale * nn.MSELoss(reduction='sum')(x * coord_mask, tx * coord_mask) / 2.0 + loss_y = self.coord_scale * nn.MSELoss(reduction='sum')(y * coord_mask, ty * coord_mask) / 2.0 + loss_w = self.coord_scale * nn.MSELoss(reduction='sum')(w * coord_mask, tw * coord_mask) / 2.0 + loss_h = self.coord_scale * nn.MSELoss(reduction='sum')(h * coord_mask, th * coord_mask) / 2.0 + loss_conf = nn.MSELoss(reduction='sum')(conf * conf_mask, tconf * conf_mask) / 2.0 + loss_cls = self.class_scale * nn.CrossEntropyLoss(reduction='sum')(cls, tcls) + loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls + t4 = time.time() + if False: + print('-----------------------------------') + print(' activation : %f' % (t1 - t0)) + print(' create pred_boxes : %f' % (t2 - t1)) + print(' build targets : %f' % (t3 - t2)) + print(' create loss : %f' % (t4 - t3)) + print(' total : %f' % (t4 - t0)) + print('%d: nGT %d, recall %d, proposals %d, loss: x %f, y %f, w %f, h %f, conf %f, cls %f, total %f' % ( + self.seen, nGT, nCorrect, nProposals, loss_x.data[0], loss_y.data[0], loss_w.data[0], loss_h.data[0], + loss_conf.data[0], loss_cls.data[0], loss.data[0])) + return loss diff --git a/models/object_detection/pytorch/yolov4/inference/gpu/tool/torch_utils.py b/models/object_detection/pytorch/yolov4/inference/gpu/tool/torch_utils.py new file mode 100644 index 000000000..61ba33456 --- /dev/null +++ b/models/object_detection/pytorch/yolov4/inference/gpu/tool/torch_utils.py @@ -0,0 +1,133 @@ +import sys +import os +import time +import math +import torch +import numpy as np +from torch.autograd import Variable + +import itertools +import struct # get_image_size +import imghdr # get_image_size + +from tool import utils + +def bbox_ious(boxes1, boxes2, x1y1x2y2=True): + if x1y1x2y2: + mx = torch.min(boxes1[0], boxes2[0]) + Mx = torch.max(boxes1[2], boxes2[2]) + my = torch.min(boxes1[1], boxes2[1]) + My = torch.max(boxes1[3], boxes2[3]) + w1 = boxes1[2] - boxes1[0] + h1 = boxes1[3] - boxes1[1] + w2 = boxes2[2] - boxes2[0] + h2 = boxes2[3] - boxes2[1] + else: + mx = torch.min(boxes1[0] - boxes1[2] / 2.0, boxes2[0] - boxes2[2] / 2.0) + Mx = torch.max(boxes1[0] + boxes1[2] / 2.0, boxes2[0] + boxes2[2] / 2.0) + my = torch.min(boxes1[1] - boxes1[3] / 2.0, boxes2[1] - boxes2[3] / 2.0) + My = torch.max(boxes1[1] + boxes1[3] / 2.0, boxes2[1] + boxes2[3] / 2.0) + w1 = boxes1[2] + h1 = boxes1[3] + w2 = boxes2[2] + h2 = boxes2[3] + uw = Mx - mx + uh = My - my + cw = w1 + w2 - uw + ch = h1 + h2 - uh + mask = ((cw <= 0) 
+ (ch <= 0) > 0) + area1 = w1 * h1 + area2 = w2 * h2 + carea = cw * ch + carea[mask] = 0 + uarea = area1 + area2 - carea + return carea / uarea + + +def get_region_boxes(boxes_and_confs): + + # print('Getting boxes from boxes and confs ...') + + boxes_list = [] + confs_list = [] + + for item in boxes_and_confs: + boxes_list.append(item[0]) + confs_list.append(item[1]) + + # boxes: [batch, num1 + num2 + num3, 1, 4] + # confs: [batch, num1 + num2 + num3, num_classes] + boxes = torch.cat(boxes_list, dim=1) + confs = torch.cat(confs_list, dim=1) + + return [boxes, confs] + + +def convert2cpu(gpu_matrix): + return torch.FloatTensor(gpu_matrix.size()).copy_(gpu_matrix) + + +def convert2cpu_long(gpu_matrix): + return torch.LongTensor(gpu_matrix.size()).copy_(gpu_matrix) + + + +def do_detect(model, img, conf_thresh, nms_thresh, iteration=0, dummy=0, batch_size=1, width=0, height=0, data_type="", benchmark=0): + model.eval() + t0 = time.time() + if dummy == 0: + if type(img) == np.ndarray and len(img.shape) == 3: # cv2 image + img = torch.from_numpy(img.transpose(2, 0, 1)).float().div(255.0).unsqueeze(0) + elif type(img) == np.ndarray and len(img.shape) == 4: + img = torch.from_numpy(img.transpose(0, 3, 1, 2)).float().div(255.0) + else: + print("unknow image type") + exit(-1) + else: + img = torch.randn(batch_size, 3, width, height, requires_grad=True) + + profiling = os.environ.get("PROFILE", "OFF").upper() in ["1", "Y", "ON", "YES", "TRUE"] + if benchmark == 1: + img = img.to("xpu") + with torch.autograd.profiler_legacy.profile(enabled=profiling, use_xpu=True, record_shapes=False) as prof: + t1 = time.time() + if benchmark == 0: + img = img.to("xpu") + if data_type == "fp16": + img = img.half() + img = torch.autograd.Variable(img) + output = model(img) + + # sync for time measurement + torch.xpu.synchronize() + if benchmark == 1: + inf_latency = time.time() - t1 + ret = None + if benchmark == 0: + output = [x.to("cpu") for x in output] + ret = utils.post_processing(img, conf_thresh, nms_thresh, output) + inf_latency = time.time() - t1 + + process_latency = t1 - t0 + if iteration > 0: + print('-----------------------------------') + print(' Preprocess : %f' % process_latency) + print(' Model Inference : %f' % inf_latency) + print('-----------------------------------') + + if profiling: + title = "/yolov4_inference_" + title += data_type + "_" + title += "bs" + str(batch_size) + "_" + + profiling_path = os.getenv('PROFILE_PATH') + if not profiling_path: + profiling_path = './' + torch.save(prof.key_averages().table(sort_by="self_xpu_time_total"), profiling_path + title + 'profiling.pt') + torch.save(prof.key_averages(group_by_input_shape=True).table(), profiling_path + title + 'profiling_detailed.pt') + prof.export_chrome_trace(profiling_path + title + 'profiling.json') + print(prof.key_averages().table(sort_by="self_xpu_time_total")) + print(prof.key_averages(group_by_input_shape=True).table()) + + return ret, inf_latency + diff --git a/models/object_detection/pytorch/yolov4/inference/gpu/tool/tv_reference/README.md b/models/object_detection/pytorch/yolov4/inference/gpu/tool/tv_reference/README.md new file mode 100644 index 000000000..280d2b26f --- /dev/null +++ b/models/object_detection/pytorch/yolov4/inference/gpu/tool/tv_reference/README.md @@ -0,0 +1,45 @@ +# Object detection reference training scripts + +This folder contains reference training scripts for object detection. 
+They serve as a log of how to train specific models, to provide baseline +training and evaluation scripts to quickly bootstrap research. + +To execute the example commands below you must install the following: + +``` +cython +pycocotools +matplotlib +``` + +You must modify the following flags: + +`--data-path=/path/to/coco/dataset` + +`--nproc_per_node=` + +Except otherwise noted, all models have been trained on 8x V100 GPUs. + +### Faster R-CNN +``` +python -m torch.distributed.launch --nproc_per_node=8 --use_env train.py\ + --dataset coco --model fasterrcnn_resnet50_fpn --epochs 26\ + --lr-steps 16 22 --aspect-ratio-group-factor 3 +``` + + +### Mask R-CNN +``` +python -m torch.distributed.launch --nproc_per_node=8 --use_env train.py\ + --dataset coco --model maskrcnn_resnet50_fpn --epochs 26\ + --lr-steps 16 22 --aspect-ratio-group-factor 3 +``` + + +### Keypoint R-CNN +``` +python -m torch.distributed.launch --nproc_per_node=8 --use_env train.py\ + --dataset coco_kp --model keypointrcnn_resnet50_fpn --epochs 46\ + --lr-steps 36 43 --aspect-ratio-group-factor 3 +``` + diff --git a/models/object_detection/pytorch/yolov4/inference/gpu/tool/tv_reference/coco_eval.py b/models/object_detection/pytorch/yolov4/inference/gpu/tool/tv_reference/coco_eval.py new file mode 100644 index 000000000..c3f515e32 --- /dev/null +++ b/models/object_detection/pytorch/yolov4/inference/gpu/tool/tv_reference/coco_eval.py @@ -0,0 +1,359 @@ +import json +import tempfile + +import numpy as np +import copy +import time +import torch +import torch._six + +from pycocotools.cocoeval import COCOeval +from pycocotools.coco import COCO +import pycocotools.mask as mask_util + +from collections import defaultdict + +from . import utils + + +class CocoEvaluator(object): + def __init__(self, coco_gt, iou_types, bbox_fmt='coco'): + assert isinstance(iou_types, (list, tuple)) + coco_gt = copy.deepcopy(coco_gt) + self.coco_gt = coco_gt + self.bbox_fmt = bbox_fmt.lower() + assert self.bbox_fmt in ['voc', 'coco', 'yolo'] + + self.iou_types = iou_types + self.coco_eval = {} + for iou_type in iou_types: + self.coco_eval[iou_type] = COCOeval(coco_gt, iouType=iou_type) + + self.img_ids = [] + self.eval_imgs = {k: [] for k in iou_types} + + def update(self, predictions): + img_ids = list(np.unique(list(predictions.keys()))) + self.img_ids.extend(img_ids) + + for iou_type in self.iou_types: + results = self.prepare(predictions, iou_type) + coco_dt = loadRes(self.coco_gt, results) if results else COCO() + coco_eval = self.coco_eval[iou_type] + + coco_eval.cocoDt = coco_dt + coco_eval.params.imgIds = list(img_ids) + img_ids, eval_imgs = evaluate(coco_eval) + + self.eval_imgs[iou_type].append(eval_imgs) + + def synchronize_between_processes(self): + for iou_type in self.iou_types: + self.eval_imgs[iou_type] = np.concatenate(self.eval_imgs[iou_type], 2) + create_common_coco_eval(self.coco_eval[iou_type], self.img_ids, self.eval_imgs[iou_type]) + + def accumulate(self): + for coco_eval in self.coco_eval.values(): + coco_eval.accumulate() + + def summarize(self): + for iou_type, coco_eval in self.coco_eval.items(): + print("IoU metric: {}".format(iou_type)) + coco_eval.summarize() + + def prepare(self, predictions, iou_type): + if iou_type == "bbox": + return self.prepare_for_coco_detection(predictions) + elif iou_type == "segm": + return self.prepare_for_coco_segmentation(predictions) + elif iou_type == "keypoints": + return self.prepare_for_coco_keypoint(predictions) + else: + raise ValueError("Unknown iou type {}".format(iou_type)) + + 
def prepare_for_coco_detection(self, predictions): + coco_results = [] + for original_id, prediction in predictions.items(): + if len(prediction) == 0: + continue + + if self.bbox_fmt == 'coco': + boxes = prediction["boxes"].tolist() + else: + boxes = prediction["boxes"] + boxes = convert_to_xywh(boxes, fmt=self.bbox_fmt).tolist() + scores = prediction["scores"].tolist() + labels = prediction["labels"].tolist() + + coco_results.extend( + [ + { + "image_id": original_id, + "category_id": labels[k], + "bbox": box, + "score": scores[k], + } + for k, box in enumerate(boxes) + ] + ) + return coco_results + + def prepare_for_coco_segmentation(self, predictions): + coco_results = [] + for original_id, prediction in predictions.items(): + if len(prediction) == 0: + continue + + scores = prediction["scores"] + labels = prediction["labels"] + masks = prediction["masks"] + + masks = masks > 0.5 + + scores = prediction["scores"].tolist() + labels = prediction["labels"].tolist() + + rles = [ + mask_util.encode(np.array(mask[0, :, :, np.newaxis], dtype=np.uint8, order="F"))[0] + for mask in masks + ] + for rle in rles: + rle["counts"] = rle["counts"].decode("utf-8") + + coco_results.extend( + [ + { + "image_id": original_id, + "category_id": labels[k], + "segmentation": rle, + "score": scores[k], + } + for k, rle in enumerate(rles) + ] + ) + return coco_results + + def prepare_for_coco_keypoint(self, predictions): + coco_results = [] + for original_id, prediction in predictions.items(): + if len(prediction) == 0: + continue + + # boxes = prediction["boxes"] + # boxes = convert_to_xywh(boxes).tolist() + scores = prediction["scores"].tolist() + labels = prediction["labels"].tolist() + keypoints = prediction["keypoints"] + keypoints = keypoints.flatten(start_dim=1).tolist() + + coco_results.extend( + [ + { + "image_id": original_id, + "category_id": labels[k], + 'keypoints': keypoint, + "score": scores[k], + } + for k, keypoint in enumerate(keypoints) + ] + ) + return coco_results + + +def convert_to_xywh(boxes, fmt='voc'): + if fmt.lower() == 'voc': + xmin, ymin, xmax, ymax = boxes.unbind(1) + return torch.stack((xmin, ymin, xmax - xmin, ymax - ymin), dim=1) + elif fmt.lower() == 'yolo': + xcen, ycen, w, h = boxes.unbind(1) + return torch.stack((xcen-w/2, ycen-h/2, w, h), dim=1) + + +def merge(img_ids, eval_imgs): + all_img_ids = utils.all_gather(img_ids) + all_eval_imgs = utils.all_gather(eval_imgs) + + merged_img_ids = [] + for p in all_img_ids: + merged_img_ids.extend(p) + + merged_eval_imgs = [] + for p in all_eval_imgs: + merged_eval_imgs.append(p) + + merged_img_ids = np.array(merged_img_ids) + merged_eval_imgs = np.concatenate(merged_eval_imgs, 2) + + # keep only unique (and in sorted order) images + merged_img_ids, idx = np.unique(merged_img_ids, return_index=True) + merged_eval_imgs = merged_eval_imgs[..., idx] + + return merged_img_ids, merged_eval_imgs + + +def create_common_coco_eval(coco_eval, img_ids, eval_imgs): + img_ids, eval_imgs = merge(img_ids, eval_imgs) + img_ids = list(img_ids) + eval_imgs = list(eval_imgs.flatten()) + + coco_eval.evalImgs = eval_imgs + coco_eval.params.imgIds = img_ids + coco_eval._paramsEval = copy.deepcopy(coco_eval.params) + + +################################################################# +# From pycocotools, just removed the prints and fixed +# a Python3 bug about unicode not defined +################################################################# + +# Ideally, pycocotools wouldn't have hard-coded prints +# so that we could avoid copy-pasting those two 
functions + +def createIndex(self): + # create index + # print('creating index...') + anns, cats, imgs = {}, {}, {} + imgToAnns, catToImgs = defaultdict(list), defaultdict(list) + if 'annotations' in self.dataset: + for ann in self.dataset['annotations']: + imgToAnns[ann['image_id']].append(ann) + anns[ann['id']] = ann + + if 'images' in self.dataset: + for img in self.dataset['images']: + imgs[img['id']] = img + + if 'categories' in self.dataset: + for cat in self.dataset['categories']: + cats[cat['id']] = cat + + if 'annotations' in self.dataset and 'categories' in self.dataset: + for ann in self.dataset['annotations']: + catToImgs[ann['category_id']].append(ann['image_id']) + + # print('index created!') + + # create class members + self.anns = anns + self.imgToAnns = imgToAnns + self.catToImgs = catToImgs + self.imgs = imgs + self.cats = cats + + +maskUtils = mask_util + + +def loadRes(self, resFile): + """ + Load result file and return a result api object. + :param resFile (str) : file name of result file + :return: res (obj) : result api object + """ + res = COCO() + res.dataset['images'] = [img for img in self.dataset['images']] + + # print('Loading and preparing results...') + # tic = time.time() + if isinstance(resFile, torch._six.string_classes): + anns = json.load(open(resFile)) + elif type(resFile) == np.ndarray: + anns = self.loadNumpyAnnotations(resFile) + else: + anns = resFile + assert type(anns) == list, 'results in not an array of objects' + annsImgIds = [ann['image_id'] for ann in anns] + assert set(annsImgIds) == (set(annsImgIds) & set(self.getImgIds())), \ + 'Results do not correspond to current coco set' + if 'caption' in anns[0]: + imgIds = set([img['id'] for img in res.dataset['images']]) & set([ann['image_id'] for ann in anns]) + res.dataset['images'] = [img for img in res.dataset['images'] if img['id'] in imgIds] + for id, ann in enumerate(anns): + ann['id'] = id + 1 + elif 'bbox' in anns[0] and not anns[0]['bbox'] == []: + res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) + for id, ann in enumerate(anns): + ann['bbox'] = ann['bbox'][0] + bb = ann['bbox'] + x1, x2, y1, y2 = [bb[0], bb[0] + bb[2], bb[1], bb[1] + bb[3]] + if 'segmentation' not in ann: + ann['segmentation'] = [[x1, y1, x1, y2, x2, y2, x2, y1]] + ann['area'] = bb[2] * bb[3] + ann['id'] = id + 1 + ann['iscrowd'] = 0 + elif 'segmentation' in anns[0]: + res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) + for id, ann in enumerate(anns): + # now only support compressed RLE format as segmentation results + ann['area'] = maskUtils.area(ann['segmentation']) + if 'bbox' not in ann: + ann['bbox'] = maskUtils.toBbox(ann['segmentation']) + ann['id'] = id + 1 + ann['iscrowd'] = 0 + elif 'keypoints' in anns[0]: + res.dataset['categories'] = copy.deepcopy(self.dataset['categories']) + for id, ann in enumerate(anns): + s = ann['keypoints'] + x = s[0::3] + y = s[1::3] + x1, x2, y1, y2 = np.min(x), np.max(x), np.min(y), np.max(y) + ann['area'] = (x2 - x1) * (y2 - y1) + ann['id'] = id + 1 + ann['bbox'] = [x1, y1, x2 - x1, y2 - y1] + # print('DONE (t={:0.2f}s)'.format(time.time()- tic)) + + res.dataset['annotations'] = anns + createIndex(res) + return res + + +def evaluate(self): + ''' + Run per image evaluation on given images and store results (a list of dict) in self.evalImgs + :return: None + ''' + # tic = time.time() + # print('Running per image evaluation...') + p = self.params + # add backward compatibility if useSegm is specified in params + if p.useSegm is not None: + 
p.iouType = 'segm' if p.useSegm == 1 else 'bbox' + print('useSegm (deprecated) is not None. Running {} evaluation'.format(p.iouType)) + # print('Evaluate annotation type *{}*'.format(p.iouType)) + p.imgIds = list(np.unique(p.imgIds)) + if p.useCats: + p.catIds = list(np.unique(p.catIds)) + p.maxDets = sorted(p.maxDets) + self.params = p + + self._prepare() + # loop through images, area range, max detection number + catIds = p.catIds if p.useCats else [-1] + + if p.iouType == 'segm' or p.iouType == 'bbox': + computeIoU = self.computeIoU + elif p.iouType == 'keypoints': + computeIoU = self.computeOks + self.ious = { + (imgId, catId): computeIoU(imgId, catId) + for imgId in p.imgIds + for catId in catIds} + + evaluateImg = self.evaluateImg + maxDet = p.maxDets[-1] + evalImgs = [ + evaluateImg(imgId, catId, areaRng, maxDet) + for catId in catIds + for areaRng in p.areaRng + for imgId in p.imgIds + ] + # this is NOT in the pycocotools code, but could be done outside + evalImgs = np.asarray(evalImgs).reshape(len(catIds), len(p.areaRng), len(p.imgIds)) + self._paramsEval = copy.deepcopy(self.params) + # toc = time.time() + # print('DONE (t={:0.2f}s).'.format(toc-tic)) + return p.imgIds, evalImgs + +################################################################# +# end of straight copy from pycocotools, just removing the prints +################################################################# diff --git a/models/object_detection/pytorch/yolov4/inference/gpu/tool/tv_reference/coco_utils.py b/models/object_detection/pytorch/yolov4/inference/gpu/tool/tv_reference/coco_utils.py new file mode 100644 index 000000000..ef31f28cd --- /dev/null +++ b/models/object_detection/pytorch/yolov4/inference/gpu/tool/tv_reference/coco_utils.py @@ -0,0 +1,263 @@ +import copy +import os +from PIL import Image + +import torch +import torch.utils.data +import torchvision + +from pycocotools import mask as coco_mask +from pycocotools.coco import COCO + +from . 
import transforms as T + + +class FilterAndRemapCocoCategories(object): + def __init__(self, categories, remap=True): + self.categories = categories + self.remap = remap + + def __call__(self, image, target): + anno = target["annotations"] + anno = [obj for obj in anno if obj["category_id"] in self.categories] + if not self.remap: + target["annotations"] = anno + return image, target + anno = copy.deepcopy(anno) + for obj in anno: + obj["category_id"] = self.categories.index(obj["category_id"]) + target["annotations"] = anno + return image, target + + +def convert_coco_poly_to_mask(segmentations, height, width): + masks = [] + for polygons in segmentations: + rles = coco_mask.frPyObjects(polygons, height, width) + mask = coco_mask.decode(rles) + if len(mask.shape) < 3: + mask = mask[..., None] + mask = torch.as_tensor(mask, dtype=torch.uint8) + mask = mask.any(dim=2) + masks.append(mask) + if masks: + masks = torch.stack(masks, dim=0) + else: + masks = torch.zeros((0, height, width), dtype=torch.uint8) + return masks + + +class ConvertCocoPolysToMask(object): + def __call__(self, image, target): + w, h = image.size + + image_id = target["image_id"] + image_id = torch.tensor([image_id]) + + anno = target["annotations"] + + anno = [obj for obj in anno if obj['iscrowd'] == 0] + + boxes = [obj["bbox"] for obj in anno] + # guard against no boxes via resizing + boxes = torch.as_tensor(boxes, dtype=torch.float32).reshape(-1, 4) + boxes[:, 2:] += boxes[:, :2] + boxes[:, 0::2].clamp_(min=0, max=w) + boxes[:, 1::2].clamp_(min=0, max=h) + + classes = [obj["category_id"] for obj in anno] + classes = torch.tensor(classes, dtype=torch.int64) + + segmentations = [obj["segmentation"] for obj in anno] + masks = convert_coco_poly_to_mask(segmentations, h, w) + + keypoints = None + if anno and "keypoints" in anno[0]: + keypoints = [obj["keypoints"] for obj in anno] + keypoints = torch.as_tensor(keypoints, dtype=torch.float32) + num_keypoints = keypoints.shape[0] + if num_keypoints: + keypoints = keypoints.view(num_keypoints, -1, 3) + + keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0]) + boxes = boxes[keep] + classes = classes[keep] + masks = masks[keep] + if keypoints is not None: + keypoints = keypoints[keep] + + target = {} + target["boxes"] = boxes + target["labels"] = classes + target["masks"] = masks + target["image_id"] = image_id + if keypoints is not None: + target["keypoints"] = keypoints + + # for conversion to coco api + area = torch.tensor([obj["area"] for obj in anno]) + iscrowd = torch.tensor([obj["iscrowd"] for obj in anno]) + target["area"] = area + target["iscrowd"] = iscrowd + + return image, target + + +def _coco_remove_images_without_annotations(dataset, cat_list=None): + def _has_only_empty_bbox(anno): + return all(any(o <= 1 for o in obj["bbox"][2:]) for obj in anno) + + def _count_visible_keypoints(anno): + return sum(sum(1 for v in ann["keypoints"][2::3] if v > 0) for ann in anno) + + min_keypoints_per_image = 10 + + def _has_valid_annotation(anno): + # if it's empty, there is no annotation + if len(anno) == 0: + return False + # if all boxes have close to zero area, there is no annotation + if _has_only_empty_bbox(anno): + return False + # keypoints task have a slight different critera for considering + # if an annotation is valid + if "keypoints" not in anno[0]: + return True + # for keypoint detection tasks, only consider valid images those + # containing at least min_keypoints_per_image + if _count_visible_keypoints(anno) >= min_keypoints_per_image: + return True + 
return False + + assert isinstance(dataset, torchvision.datasets.CocoDetection) + ids = [] + for ds_idx, img_id in enumerate(dataset.ids): + ann_ids = dataset.coco.getAnnIds(imgIds=img_id, iscrowd=None) + anno = dataset.coco.loadAnns(ann_ids) + if cat_list: + anno = [obj for obj in anno if obj["category_id"] in cat_list] + if _has_valid_annotation(anno): + ids.append(ds_idx) + + dataset = torch.utils.data.Subset(dataset, ids) + return dataset + + +def convert_to_coco_api(ds, bbox_fmt='voc'): + """ + """ + print("in function convert_to_coco_api...") + coco_ds = COCO() + # annotation IDs need to start at 1, not 0, see torchvision issue #1530 + ann_id = 1 + dataset = {'images': [], 'categories': [], 'annotations': []} + categories = set() + for img_idx in range(len(ds)): + # find better way to get target + # targets = ds.get_annotations(img_idx) + img, targets = ds[img_idx] + image_id = targets["image_id"].item() + img_dict = {} + img_dict['id'] = image_id + img_dict['height'] = img.shape[-2] + img_dict['width'] = img.shape[-1] + dataset['images'].append(img_dict) + bboxes = targets["boxes"] + # to coco format: xmin, ymin, w, h + if bbox_fmt.lower() == "voc": # xmin, ymin, xmax, ymax + bboxes[:, 2:] -= bboxes[:, :2] + elif bbox_fmt.lower() == "yolo": # xcen, ycen, w, h + bboxes[:, :2] = bboxes[:, :2] - bboxes[:, 2:]/2 + elif bbox_fmt.lower() == "coco": + pass + else: + raise ValueError(f"bounding box format {bbox_fmt} not supported!") + bboxes = bboxes.tolist() + labels = targets['labels'].tolist() + areas = targets['area'].tolist() + iscrowd = targets['iscrowd'].tolist() + if 'masks' in targets: + masks = targets['masks'] + # make masks Fortran contiguous for coco_mask + masks = masks.permute(0, 2, 1).contiguous().permute(0, 2, 1) + if 'keypoints' in targets: + keypoints = targets['keypoints'] + keypoints = keypoints.reshape(keypoints.shape[0], -1).tolist() + num_objs = len(bboxes) + for i in range(num_objs): + ann = {} + ann['image_id'] = image_id + ann['bbox'] = bboxes[i] + ann['category_id'] = labels[i] + categories.add(labels[i]) + ann['area'] = areas[i] + ann['iscrowd'] = iscrowd[i] + ann['id'] = ann_id + if 'masks' in targets: + ann["segmentation"] = coco_mask.encode(masks[i].numpy()) + if 'keypoints' in targets: + ann['keypoints'] = keypoints[i] + ann['num_keypoints'] = sum(k != 0 for k in keypoints[i][2::3]) + dataset['annotations'].append(ann) + ann_id += 1 + dataset['categories'] = [{'id': i} for i in sorted(categories)] + coco_ds.dataset = dataset + coco_ds.createIndex() + return coco_ds + + +def get_coco_api_from_dataset(dataset): + for _ in range(10): + if isinstance(dataset, torchvision.datasets.CocoDetection): + break + if isinstance(dataset, torch.utils.data.Subset): + dataset = dataset.dataset + if isinstance(dataset, torchvision.datasets.CocoDetection): + return dataset.coco + return convert_to_coco_api(dataset) + + +class CocoDetection(torchvision.datasets.CocoDetection): + def __init__(self, img_folder, ann_file, transforms): + super(CocoDetection, self).__init__(img_folder, ann_file) + self._transforms = transforms + + def __getitem__(self, idx): + img, target = super(CocoDetection, self).__getitem__(idx) + image_id = self.ids[idx] + target = dict(image_id=image_id, annotations=target) + if self._transforms is not None: + img, target = self._transforms(img, target) + return img, target + + +def get_coco(root, image_set, transforms, mode='instances'): + anno_file_template = "{}_{}2017.json" + PATHS = { + "train": ("train2017", os.path.join("annotations", 
anno_file_template.format(mode, "train"))), + "val": ("val2017", os.path.join("annotations", anno_file_template.format(mode, "val"))), + # "train": ("val2017", os.path.join("annotations", anno_file_template.format(mode, "val"))) + } + + t = [ConvertCocoPolysToMask()] + + if transforms is not None: + t.append(transforms) + transforms = T.Compose(t) + + img_folder, ann_file = PATHS[image_set] + img_folder = os.path.join(root, img_folder) + ann_file = os.path.join(root, ann_file) + + dataset = CocoDetection(img_folder, ann_file, transforms=transforms) + + if image_set == "train": + dataset = _coco_remove_images_without_annotations(dataset) + + # dataset = torch.utils.data.Subset(dataset, [i for i in range(500)]) + + return dataset + + +def get_coco_kp(root, image_set, transforms): + return get_coco(root, image_set, transforms, mode="person_keypoints") diff --git a/models/object_detection/pytorch/yolov4/inference/gpu/tool/tv_reference/engine.py b/models/object_detection/pytorch/yolov4/inference/gpu/tool/tv_reference/engine.py new file mode 100644 index 000000000..31d948b29 --- /dev/null +++ b/models/object_detection/pytorch/yolov4/inference/gpu/tool/tv_reference/engine.py @@ -0,0 +1,110 @@ +import math +import sys +import time +import torch + +import torchvision.models.detection.mask_rcnn + +from .coco_utils import get_coco_api_from_dataset +from .coco_eval import CocoEvaluator +from . import utils + + +def train_one_epoch(model, optimizer, data_loader, device, epoch, print_freq): + model.train() + metric_logger = utils.MetricLogger(delimiter=" ") + metric_logger.add_meter('lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}')) + header = 'Epoch: [{}]'.format(epoch) + + lr_scheduler = None + if epoch == 0: + warmup_factor = 1. / 1000 + warmup_iters = min(1000, len(data_loader) - 1) + + lr_scheduler = utils.warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor) + + for images, targets in metric_logger.log_every(data_loader, print_freq, header): + images = list(image.to(device) for image in images) + targets = [{k: v.to(device) for k, v in t.items()} for t in targets] + + loss_dict = model(images, targets) + + losses = sum(loss for loss in loss_dict.values()) + + # reduce losses over all GPUs for logging purposes + loss_dict_reduced = utils.reduce_dict(loss_dict) + losses_reduced = sum(loss for loss in loss_dict_reduced.values()) + + loss_value = losses_reduced.item() + + if not math.isfinite(loss_value): + print("Loss is {}, stopping training".format(loss_value)) + print(loss_dict_reduced) + sys.exit(1) + + optimizer.zero_grad() + losses.backward() + optimizer.step() + + if lr_scheduler is not None: + lr_scheduler.step() + + metric_logger.update(loss=losses_reduced, **loss_dict_reduced) + metric_logger.update(lr=optimizer.param_groups[0]["lr"]) + + return metric_logger + + +def _get_iou_types(model): + model_without_ddp = model + if isinstance(model, torch.nn.parallel.DistributedDataParallel): + model_without_ddp = model.module + iou_types = ["bbox"] + if isinstance(model_without_ddp, torchvision.models.detection.MaskRCNN): + iou_types.append("segm") + if isinstance(model_without_ddp, torchvision.models.detection.KeypointRCNN): + iou_types.append("keypoints") + return iou_types + + +@torch.no_grad() +def evaluate(model, data_loader, device): + n_threads = torch.get_num_threads() + # FIXME remove this and make paste_masks_in_image run on the GPU + torch.set_num_threads(1) + cpu_device = torch.device("cpu") + model.eval() + metric_logger = utils.MetricLogger(delimiter=" ") + header = 
'Test:' + + coco = get_coco_api_from_dataset(data_loader.dataset) + iou_types = _get_iou_types(model) + coco_evaluator = CocoEvaluator(coco, iou_types) + + for images, targets in metric_logger.log_every(data_loader, 100, header): + images = list(img.to(device) for img in images) + targets = [{k: v.to(device) for k, v in t.items()} for t in targets] + + torch.cuda.synchronize() + model_time = time.time() + outputs = model(images) + + outputs = [{k: v.to(cpu_device) for k, v in t.items()} for t in outputs] + model_time = time.time() - model_time + + res = {target["image_id"].item(): output for target, output in zip(targets, outputs)} + evaluator_time = time.time() + coco_evaluator.update(res) + evaluator_time = time.time() - evaluator_time + metric_logger.update(model_time=model_time, evaluator_time=evaluator_time) + + # gather the stats from all processes + metric_logger.synchronize_between_processes() + print("Averaged stats:", metric_logger) + coco_evaluator.synchronize_between_processes() + + # accumulate predictions from all images + coco_evaluator.accumulate() + coco_evaluator.summarize() + torch.set_num_threads(n_threads) + return coco_evaluator diff --git a/models/object_detection/pytorch/yolov4/inference/gpu/tool/tv_reference/group_by_aspect_ratio.py b/models/object_detection/pytorch/yolov4/inference/gpu/tool/tv_reference/group_by_aspect_ratio.py new file mode 100644 index 000000000..517056056 --- /dev/null +++ b/models/object_detection/pytorch/yolov4/inference/gpu/tool/tv_reference/group_by_aspect_ratio.py @@ -0,0 +1,195 @@ +import bisect +from collections import defaultdict +import copy +from itertools import repeat, chain +import math +import numpy as np + +import torch +import torch.utils.data +from torch.utils.data.sampler import BatchSampler, Sampler +from torch.utils.model_zoo import tqdm +import torchvision + +from PIL import Image + + +def _repeat_to_at_least(iterable, n): + repeat_times = math.ceil(n / len(iterable)) + repeated = chain.from_iterable(repeat(iterable, repeat_times)) + return list(repeated) + + +class GroupedBatchSampler(BatchSampler): + """ + Wraps another sampler to yield a mini-batch of indices. + It enforces that the batch only contain elements from the same group. + It also tries to provide mini-batches which follows an ordering which is + as close as possible to the ordering from the original sampler. + Arguments: + sampler (Sampler): Base sampler. + group_ids (list[int]): If the sampler produces indices in range [0, N), + `group_ids` must be a list of `N` ints which contains the group id of each sample. + The group ids must be a continuous set of integers starting from + 0, i.e. they must be in the range [0, num_groups). + batch_size (int): Size of mini-batch. 
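+
+        Illustrative sketch (not part of the upstream torchvision reference code;
+        assumes a plain ``SequentialSampler`` over six samples in two groups)::
+
+            sampler = torch.utils.data.SequentialSampler(range(6))
+            group_ids = [0, 0, 1, 1, 0, 1]
+            batches = list(GroupedBatchSampler(sampler, group_ids, batch_size=2))
+            # Each batch mixes indices from a single group only; the last batch is
+            # topped up with earlier indices of the same group so that the number
+            # of batches stays len(sampler) // batch_size, e.g. [[0, 1], [2, 3], [4, 0]].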
+ """ + def __init__(self, sampler, group_ids, batch_size): + if not isinstance(sampler, Sampler): + raise ValueError( + "sampler should be an instance of " + "torch.utils.data.Sampler, but got sampler={}".format(sampler) + ) + self.sampler = sampler + self.group_ids = group_ids + self.batch_size = batch_size + + def __iter__(self): + buffer_per_group = defaultdict(list) + samples_per_group = defaultdict(list) + + num_batches = 0 + for idx in self.sampler: + group_id = self.group_ids[idx] + buffer_per_group[group_id].append(idx) + samples_per_group[group_id].append(idx) + if len(buffer_per_group[group_id]) == self.batch_size: + yield buffer_per_group[group_id] + num_batches += 1 + del buffer_per_group[group_id] + assert len(buffer_per_group[group_id]) < self.batch_size + + # now we have run out of elements that satisfy + # the group criteria, let's return the remaining + # elements so that the size of the sampler is + # deterministic + expected_num_batches = len(self) + num_remaining = expected_num_batches - num_batches + if num_remaining > 0: + # for the remaining batches, take first the buffers with largest number + # of elements + for group_id, _ in sorted(buffer_per_group.items(), + key=lambda x: len(x[1]), reverse=True): + remaining = self.batch_size - len(buffer_per_group[group_id]) + samples_from_group_id = _repeat_to_at_least(samples_per_group[group_id], remaining) + buffer_per_group[group_id].extend(samples_from_group_id[:remaining]) + assert len(buffer_per_group[group_id]) == self.batch_size + yield buffer_per_group[group_id] + num_remaining -= 1 + if num_remaining == 0: + break + assert num_remaining == 0 + + def __len__(self): + return len(self.sampler) // self.batch_size + + +def _compute_aspect_ratios_slow(dataset, indices=None): + print("Your dataset doesn't support the fast path for " + "computing the aspect ratios, so will iterate over " + "the full dataset and load every image instead. 
" + "This might take some time...") + if indices is None: + indices = range(len(dataset)) + + class SubsetSampler(Sampler): + def __init__(self, indices): + self.indices = indices + + def __iter__(self): + return iter(self.indices) + + def __len__(self): + return len(self.indices) + + sampler = SubsetSampler(indices) + data_loader = torch.utils.data.DataLoader( + dataset, batch_size=1, sampler=sampler, + num_workers=14, # you might want to increase it for faster processing + collate_fn=lambda x: x[0]) + aspect_ratios = [] + with tqdm(total=len(dataset)) as pbar: + for _i, (img, _) in enumerate(data_loader): + pbar.update(1) + height, width = img.shape[-2:] + aspect_ratio = float(width) / float(height) + aspect_ratios.append(aspect_ratio) + return aspect_ratios + + +def _compute_aspect_ratios_custom_dataset(dataset, indices=None): + if indices is None: + indices = range(len(dataset)) + aspect_ratios = [] + for i in indices: + height, width = dataset.get_height_and_width(i) + aspect_ratio = float(width) / float(height) + aspect_ratios.append(aspect_ratio) + return aspect_ratios + + +def _compute_aspect_ratios_coco_dataset(dataset, indices=None): + if indices is None: + indices = range(len(dataset)) + aspect_ratios = [] + for i in indices: + img_info = dataset.coco.imgs[dataset.ids[i]] + aspect_ratio = float(img_info["width"]) / float(img_info["height"]) + aspect_ratios.append(aspect_ratio) + return aspect_ratios + + +def _compute_aspect_ratios_voc_dataset(dataset, indices=None): + if indices is None: + indices = range(len(dataset)) + aspect_ratios = [] + for i in indices: + # this doesn't load the data into memory, because PIL loads it lazily + width, height = Image.open(dataset.images[i]).size + aspect_ratio = float(width) / float(height) + aspect_ratios.append(aspect_ratio) + return aspect_ratios + + +def _compute_aspect_ratios_subset_dataset(dataset, indices=None): + if indices is None: + indices = range(len(dataset)) + + ds_indices = [dataset.indices[i] for i in indices] + return compute_aspect_ratios(dataset.dataset, ds_indices) + + +def compute_aspect_ratios(dataset, indices=None): + if hasattr(dataset, "get_height_and_width"): + return _compute_aspect_ratios_custom_dataset(dataset, indices) + + if isinstance(dataset, torchvision.datasets.CocoDetection): + return _compute_aspect_ratios_coco_dataset(dataset, indices) + + if isinstance(dataset, torchvision.datasets.VOCDetection): + return _compute_aspect_ratios_voc_dataset(dataset, indices) + + if isinstance(dataset, torch.utils.data.Subset): + return _compute_aspect_ratios_subset_dataset(dataset, indices) + + # slow path + return _compute_aspect_ratios_slow(dataset, indices) + + +def _quantize(x, bins): + bins = copy.deepcopy(bins) + bins = sorted(bins) + quantized = list(map(lambda y: bisect.bisect_right(bins, y), x)) + return quantized + + +def create_aspect_ratio_groups(dataset, k=0): + aspect_ratios = compute_aspect_ratios(dataset) + bins = (2 ** np.linspace(-1, 1, 2 * k + 1)).tolist() if k > 0 else [1.0] + groups = _quantize(aspect_ratios, bins) + # count number of elements per group + counts = np.unique(groups, return_counts=True)[1] + fbins = [0] + bins + [np.inf] + print("Using {} as bins for aspect ratio quantization".format(fbins)) + print("Count of instances per bin: {}".format(counts)) + return groups diff --git a/models/object_detection/pytorch/yolov4/inference/gpu/tool/tv_reference/train.py b/models/object_detection/pytorch/yolov4/inference/gpu/tool/tv_reference/train.py new file mode 100644 index 000000000..6a1b7ed01 --- 
/dev/null +++ b/models/object_detection/pytorch/yolov4/inference/gpu/tool/tv_reference/train.py @@ -0,0 +1,201 @@ +r"""PyTorch Detection Training. + +To run in a multi-gpu environment, use the distributed launcher:: + + python -m torch.distributed.launch --nproc_per_node=$NGPU --use_env \ + train.py ... --world-size $NGPU + +The default hyperparameters are tuned for training on 8 gpus and 2 images per gpu. + --lr 0.02 --batch-size 2 --world-size 8 +If you use different number of gpus, the learning rate should be changed to 0.02/8*$NGPU. + +On top of that, for training Faster/Mask R-CNN, the default hyperparameters are + --epochs 26 --lr-steps 16 22 --aspect-ratio-group-factor 3 + +Also, if you train Keypoint R-CNN, the default hyperparameters are + --epochs 46 --lr-steps 36 43 --aspect-ratio-group-factor 3 +Because the number of images is smaller in the person keypoint subset of COCO, +the number of epochs should be adapted so that we have the same number of iterations. +""" +import datetime +import os +import time + +import torch +import torch.utils.data +from torch import nn +import torchvision +import torchvision.models.detection +import torchvision.models.detection.mask_rcnn + +from .coco_utils import get_coco, get_coco_kp + +from .group_by_aspect_ratio import GroupedBatchSampler, create_aspect_ratio_groups +from .engine import train_one_epoch, evaluate + +from . import utils +from . import transforms as T + + +def get_dataset(name, image_set, transform, data_path): + paths = { + "coco": (data_path, get_coco, 91), + "coco_kp": (data_path, get_coco_kp, 2) + } + p, ds_fn, num_classes = paths[name] + + ds = ds_fn(p, image_set=image_set, transforms=transform) + return ds, num_classes + + +def get_transform(train): + transforms = [] + transforms.append(T.ToTensor()) + if train: + transforms.append(T.RandomHorizontalFlip(0.5)) + return T.Compose(transforms) + + +def main(args): + utils.init_distributed_mode(args) + print(args) + + device = torch.device(args.device) + + # Data loading code + print("Loading data") + + dataset, num_classes = get_dataset(args.dataset, "train", get_transform(train=True), args.data_path) + dataset_test, _ = get_dataset(args.dataset, "val", get_transform(train=False), args.data_path) + + print("Creating data loaders") + if args.distributed: + train_sampler = torch.utils.data.distributed.DistributedSampler(dataset) + test_sampler = torch.utils.data.distributed.DistributedSampler(dataset_test) + else: + train_sampler = torch.utils.data.RandomSampler(dataset) + test_sampler = torch.utils.data.SequentialSampler(dataset_test) + + if args.aspect_ratio_group_factor >= 0: + group_ids = create_aspect_ratio_groups(dataset, k=args.aspect_ratio_group_factor) + train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, args.batch_size) + else: + train_batch_sampler = torch.utils.data.BatchSampler( + train_sampler, args.batch_size, drop_last=True) + + data_loader = torch.utils.data.DataLoader( + dataset, batch_sampler=train_batch_sampler, num_workers=args.workers, + collate_fn=utils.collate_fn) + + data_loader_test = torch.utils.data.DataLoader( + dataset_test, batch_size=1, + sampler=test_sampler, num_workers=args.workers, + collate_fn=utils.collate_fn) + + print("Creating model") + model = torchvision.models.detection.__dict__[args.model](num_classes=num_classes, + pretrained=args.pretrained) + model.to(device) + + model_without_ddp = model + if args.distributed: + model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu]) + model_without_ddp = 
model.module + + params = [p for p in model.parameters() if p.requires_grad] + optimizer = torch.optim.SGD( + params, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) + + # lr_scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=args.lr_step_size, gamma=args.lr_gamma) + lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=args.lr_steps, gamma=args.lr_gamma) + + if args.resume: + checkpoint = torch.load(args.resume, map_location='cpu') + model_without_ddp.load_state_dict(checkpoint['model']) + optimizer.load_state_dict(checkpoint['optimizer']) + lr_scheduler.load_state_dict(checkpoint['lr_scheduler']) + args.start_epoch = checkpoint['epoch'] + 1 + + if args.test_only: + evaluate(model, data_loader_test, device=device) + return + + print("Start training") + start_time = time.time() + for epoch in range(args.start_epoch, args.epochs): + if args.distributed: + train_sampler.set_epoch(epoch) + train_one_epoch(model, optimizer, data_loader, device, epoch, args.print_freq) + lr_scheduler.step() + if args.output_dir: + utils.save_on_master({ + 'model': model_without_ddp.state_dict(), + 'optimizer': optimizer.state_dict(), + 'lr_scheduler': lr_scheduler.state_dict(), + 'args': args, + 'epoch': epoch}, + os.path.join(args.output_dir, 'model_{}.pth'.format(epoch))) + + # evaluate after every epoch + evaluate(model, data_loader_test, device=device) + + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print('Training time {}'.format(total_time_str)) + + +if __name__ == "__main__": + import argparse + parser = argparse.ArgumentParser( + description=__doc__) + + parser.add_argument('--data-path', default='/datasets01/COCO/022719/', help='dataset') + parser.add_argument('--dataset', default='coco', help='dataset') + parser.add_argument('--model', default='maskrcnn_resnet50_fpn', help='model') + parser.add_argument('--device', default='cuda', help='device') + parser.add_argument('-b', '--batch-size', default=2, type=int, + help='images per gpu, the total batch size is $NGPU x batch_size') + parser.add_argument('--epochs', default=26, type=int, metavar='N', + help='number of total epochs to run') + parser.add_argument('-j', '--workers', default=4, type=int, metavar='N', + help='number of data loading workers (default: 4)') + parser.add_argument('--lr', default=0.02, type=float, + help='initial learning rate, 0.02 is the default value for training ' + 'on 8 gpus and 2 images_per_gpu') + parser.add_argument('--momentum', default=0.9, type=float, metavar='M', + help='momentum') + parser.add_argument('--wd', '--weight-decay', default=1e-4, type=float, + metavar='W', help='weight decay (default: 1e-4)', + dest='weight_decay') + parser.add_argument('--lr-step-size', default=8, type=int, help='decrease lr every step-size epochs') + parser.add_argument('--lr-steps', default=[16, 22], nargs='+', type=int, help='decrease lr every step-size epochs') + parser.add_argument('--lr-gamma', default=0.1, type=float, help='decrease lr by a factor of lr-gamma') + parser.add_argument('--print-freq', default=20, type=int, help='print frequency') + parser.add_argument('--output-dir', default='.', help='path where to save') + parser.add_argument('--resume', default='', help='resume from checkpoint') + parser.add_argument('--start_epoch', default=0, type=int, help='start epoch') + parser.add_argument('--aspect-ratio-group-factor', default=3, type=int) + parser.add_argument( + "--test-only", + dest="test_only", + help="Only 
test the model", + action="store_true", + ) + parser.add_argument( + "--pretrained", + dest="pretrained", + help="Use pre-trained models from the modelzoo", + action="store_true", + ) + + # distributed training parameters + parser.add_argument('--world-size', default=1, type=int, + help='number of distributed processes') + parser.add_argument('--dist-url', default='env://', help='url used to set up distributed training') + + args = parser.parse_args() + + if args.output_dir: + utils.mkdir(args.output_dir) + + main(args) diff --git a/models/object_detection/pytorch/yolov4/inference/gpu/tool/tv_reference/transforms.py b/models/object_detection/pytorch/yolov4/inference/gpu/tool/tv_reference/transforms.py new file mode 100644 index 000000000..73efc92bd --- /dev/null +++ b/models/object_detection/pytorch/yolov4/inference/gpu/tool/tv_reference/transforms.py @@ -0,0 +1,50 @@ +import random +import torch + +from torchvision.transforms import functional as F + + +def _flip_coco_person_keypoints(kps, width): + flip_inds = [0, 2, 1, 4, 3, 6, 5, 8, 7, 10, 9, 12, 11, 14, 13, 16, 15] + flipped_data = kps[:, flip_inds] + flipped_data[..., 0] = width - flipped_data[..., 0] + # Maintain COCO convention that if visibility == 0, then x, y = 0 + inds = flipped_data[..., 2] == 0 + flipped_data[inds] = 0 + return flipped_data + + +class Compose(object): + def __init__(self, transforms): + self.transforms = transforms + + def __call__(self, image, target): + for t in self.transforms: + image, target = t(image, target) + return image, target + + +class RandomHorizontalFlip(object): + def __init__(self, prob): + self.prob = prob + + def __call__(self, image, target): + if random.random() < self.prob: + height, width = image.shape[-2:] + image = image.flip(-1) + bbox = target["boxes"] + bbox[:, [0, 2]] = width - bbox[:, [2, 0]] + target["boxes"] = bbox + if "masks" in target: + target["masks"] = target["masks"].flip(-1) + if "keypoints" in target: + keypoints = target["keypoints"] + keypoints = _flip_coco_person_keypoints(keypoints, width) + target["keypoints"] = keypoints + return image, target + + +class ToTensor(object): + def __call__(self, image, target): + image = F.to_tensor(image) + return image, target diff --git a/models/object_detection/pytorch/yolov4/inference/gpu/tool/tv_reference/utils.py b/models/object_detection/pytorch/yolov4/inference/gpu/tool/tv_reference/utils.py new file mode 100644 index 000000000..82ae79bc3 --- /dev/null +++ b/models/object_detection/pytorch/yolov4/inference/gpu/tool/tv_reference/utils.py @@ -0,0 +1,324 @@ +from collections import defaultdict, deque +import datetime +import pickle +import time + +import torch +import torch.distributed as dist + +import errno +import os + + +class SmoothedValue(object): + """Track a series of values and provide access to smoothed values over a + window or the global series average. + """ + + def __init__(self, window_size=20, fmt=None): + if fmt is None: + fmt = "{median:.4f} ({global_avg:.4f})" + self.deque = deque(maxlen=window_size) + self.total = 0.0 + self.count = 0 + self.fmt = fmt + + def update(self, value, n=1): + self.deque.append(value) + self.count += n + self.total += value * n + + def synchronize_between_processes(self): + """ + Warning: does not synchronize the deque! 
+ """ + if not is_dist_avail_and_initialized(): + return + t = torch.tensor([self.count, self.total], dtype=torch.float64, device='cuda') + dist.barrier() + dist.all_reduce(t) + t = t.tolist() + self.count = int(t[0]) + self.total = t[1] + + @property + def median(self): + d = torch.tensor(list(self.deque)) + return d.median().item() + + @property + def avg(self): + d = torch.tensor(list(self.deque), dtype=torch.float32) + return d.mean().item() + + @property + def global_avg(self): + return self.total / self.count + + @property + def max(self): + return max(self.deque) + + @property + def value(self): + return self.deque[-1] + + def __str__(self): + return self.fmt.format( + median=self.median, + avg=self.avg, + global_avg=self.global_avg, + max=self.max, + value=self.value) + + +def all_gather(data): + """ + Run all_gather on arbitrary picklable data (not necessarily tensors) + Args: + data: any picklable object + Returns: + list[data]: list of data gathered from each rank + """ + world_size = get_world_size() + if world_size == 1: + return [data] + + # serialized to a Tensor + buffer = pickle.dumps(data) + storage = torch.ByteStorage.from_buffer(buffer) + tensor = torch.ByteTensor(storage).to("cuda") + + # obtain Tensor size of each rank + local_size = torch.tensor([tensor.numel()], device="cuda") + size_list = [torch.tensor([0], device="cuda") for _ in range(world_size)] + dist.all_gather(size_list, local_size) + size_list = [int(size.item()) for size in size_list] + max_size = max(size_list) + + # receiving Tensor from all ranks + # we pad the tensor because torch all_gather does not support + # gathering tensors of different shapes + tensor_list = [] + for _ in size_list: + tensor_list.append(torch.empty((max_size,), dtype=torch.uint8, device="cuda")) + if local_size != max_size: + padding = torch.empty(size=(max_size - local_size,), dtype=torch.uint8, device="cuda") + tensor = torch.cat((tensor, padding), dim=0) + dist.all_gather(tensor_list, tensor) + + data_list = [] + for size, tensor in zip(size_list, tensor_list): + buffer = tensor.cpu().numpy().tobytes()[:size] + data_list.append(pickle.loads(buffer)) + + return data_list + + +def reduce_dict(input_dict, average=True): + """ + Args: + input_dict (dict): all the values will be reduced + average (bool): whether to do average or sum + Reduce the values in the dictionary from all processes so that all processes + have the averaged results. Returns a dict with the same fields as + input_dict, after reduction. 
+ """ + world_size = get_world_size() + if world_size < 2: + return input_dict + with torch.no_grad(): + names = [] + values = [] + # sort the keys so that they are consistent across processes + for k in sorted(input_dict.keys()): + names.append(k) + values.append(input_dict[k]) + values = torch.stack(values, dim=0) + dist.all_reduce(values) + if average: + values /= world_size + reduced_dict = {k: v for k, v in zip(names, values)} + return reduced_dict + + +class MetricLogger(object): + def __init__(self, delimiter="\t"): + self.meters = defaultdict(SmoothedValue) + self.delimiter = delimiter + + def update(self, **kwargs): + for k, v in kwargs.items(): + if isinstance(v, torch.Tensor): + v = v.item() + assert isinstance(v, (float, int)) + self.meters[k].update(v) + + def __getattr__(self, attr): + if attr in self.meters: + return self.meters[attr] + if attr in self.__dict__: + return self.__dict__[attr] + raise AttributeError("'{}' object has no attribute '{}'".format( + type(self).__name__, attr)) + + def __str__(self): + loss_str = [] + for name, meter in self.meters.items(): + loss_str.append( + "{}: {}".format(name, str(meter)) + ) + return self.delimiter.join(loss_str) + + def synchronize_between_processes(self): + for meter in self.meters.values(): + meter.synchronize_between_processes() + + def add_meter(self, name, meter): + self.meters[name] = meter + + def log_every(self, iterable, print_freq, header=None): + i = 0 + if not header: + header = '' + start_time = time.time() + end = time.time() + iter_time = SmoothedValue(fmt='{avg:.4f}') + data_time = SmoothedValue(fmt='{avg:.4f}') + space_fmt = ':' + str(len(str(len(iterable)))) + 'd' + if torch.cuda.is_available(): + log_msg = self.delimiter.join([ + header, + '[{0' + space_fmt + '}/{1}]', + 'eta: {eta}', + '{meters}', + 'time: {time}', + 'data: {data}', + 'max mem: {memory:.0f}' + ]) + else: + log_msg = self.delimiter.join([ + header, + '[{0' + space_fmt + '}/{1}]', + 'eta: {eta}', + '{meters}', + 'time: {time}', + 'data: {data}' + ]) + MB = 1024.0 * 1024.0 + for obj in iterable: + data_time.update(time.time() - end) + yield obj + iter_time.update(time.time() - end) + if i % print_freq == 0 or i == len(iterable) - 1: + eta_seconds = iter_time.global_avg * (len(iterable) - i) + eta_string = str(datetime.timedelta(seconds=int(eta_seconds))) + if torch.cuda.is_available(): + print(log_msg.format( + i, len(iterable), eta=eta_string, + meters=str(self), + time=str(iter_time), data=str(data_time), + memory=torch.cuda.max_memory_allocated() / MB)) + else: + print(log_msg.format( + i, len(iterable), eta=eta_string, + meters=str(self), + time=str(iter_time), data=str(data_time))) + i += 1 + end = time.time() + total_time = time.time() - start_time + total_time_str = str(datetime.timedelta(seconds=int(total_time))) + print('{} Total time: {} ({:.4f} s / it)'.format( + header, total_time_str, total_time / len(iterable))) + + +def collate_fn(batch): + return tuple(zip(*batch)) + + +def warmup_lr_scheduler(optimizer, warmup_iters, warmup_factor): + + def f(x): + if x >= warmup_iters: + return 1 + alpha = float(x) / warmup_iters + return warmup_factor * (1 - alpha) + alpha + + return torch.optim.lr_scheduler.LambdaLR(optimizer, f) + + +def mkdir(path): + try: + os.makedirs(path) + except OSError as e: + if e.errno != errno.EEXIST: + raise + + +def setup_for_distributed(is_master): + """ + This function disables printing when not in master process + """ + import builtins as __builtin__ + builtin_print = __builtin__.print + + def print(*args, 
**kwargs): + force = kwargs.pop('force', False) + if is_master or force: + builtin_print(*args, **kwargs) + + __builtin__.print = print + + +def is_dist_avail_and_initialized(): + if not dist.is_available(): + return False + if not dist.is_initialized(): + return False + return True + + +def get_world_size(): + if not is_dist_avail_and_initialized(): + return 1 + return dist.get_world_size() + + +def get_rank(): + if not is_dist_avail_and_initialized(): + return 0 + return dist.get_rank() + + +def is_main_process(): + return get_rank() == 0 + + +def save_on_master(*args, **kwargs): + if is_main_process(): + torch.save(*args, **kwargs) + + +def init_distributed_mode(args): + if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ: + args.rank = int(os.environ["RANK"]) + args.world_size = int(os.environ['WORLD_SIZE']) + args.gpu = int(os.environ['LOCAL_RANK']) + elif 'SLURM_PROCID' in os.environ: + args.rank = int(os.environ['SLURM_PROCID']) + args.gpu = args.rank % torch.cuda.device_count() + else: + print('Not using distributed mode') + args.distributed = False + return + + args.distributed = True + + torch.cuda.set_device(args.gpu) + args.dist_backend = 'nccl' + print('| distributed init (rank {}): {}'.format( + args.rank, args.dist_url), flush=True) + torch.distributed.init_process_group(backend=args.dist_backend, init_method=args.dist_url, + world_size=args.world_size, rank=args.rank) + torch.distributed.barrier() + setup_for_distributed(args.rank == 0) diff --git a/models/object_detection/pytorch/yolov4/inference/gpu/tool/utils.py b/models/object_detection/pytorch/yolov4/inference/gpu/tool/utils.py new file mode 100644 index 000000000..6593c35a6 --- /dev/null +++ b/models/object_detection/pytorch/yolov4/inference/gpu/tool/utils.py @@ -0,0 +1,233 @@ +import sys +import os +import time +import math +import numpy as np + +import itertools +import struct # get_image_size +import imghdr # get_image_size + + +def sigmoid(x): + return 1.0 / (np.exp(-x) + 1.) 
+ + +def softmax(x): + x = np.exp(x - np.expand_dims(np.max(x, axis=1), axis=1)) + x = x / np.expand_dims(x.sum(axis=1), axis=1) + return x + + +def bbox_iou(box1, box2, x1y1x2y2=True): + + # print('iou box1:', box1) + # print('iou box2:', box2) + + if x1y1x2y2: + mx = min(box1[0], box2[0]) + Mx = max(box1[2], box2[2]) + my = min(box1[1], box2[1]) + My = max(box1[3], box2[3]) + w1 = box1[2] - box1[0] + h1 = box1[3] - box1[1] + w2 = box2[2] - box2[0] + h2 = box2[3] - box2[1] + else: + w1 = box1[2] + h1 = box1[3] + w2 = box2[2] + h2 = box2[3] + + mx = min(box1[0], box2[0]) + Mx = max(box1[0] + w1, box2[0] + w2) + my = min(box1[1], box2[1]) + My = max(box1[1] + h1, box2[1] + h2) + uw = Mx - mx + uh = My - my + cw = w1 + w2 - uw + ch = h1 + h2 - uh + carea = 0 + if cw <= 0 or ch <= 0: + return 0.0 + + area1 = w1 * h1 + area2 = w2 * h2 + carea = cw * ch + uarea = area1 + area2 - carea + return carea / uarea + + +def nms_cpu(boxes, confs, nms_thresh=0.5, min_mode=False): + # print(boxes.shape) + x1 = boxes[:, 0] + y1 = boxes[:, 1] + x2 = boxes[:, 2] + y2 = boxes[:, 3] + + areas = (x2 - x1) * (y2 - y1) + order = confs.argsort()[::-1] + + keep = [] + while order.size > 0: + idx_self = order[0] + idx_other = order[1:] + + keep.append(idx_self) + + xx1 = np.maximum(x1[idx_self], x1[idx_other]) + yy1 = np.maximum(y1[idx_self], y1[idx_other]) + xx2 = np.minimum(x2[idx_self], x2[idx_other]) + yy2 = np.minimum(y2[idx_self], y2[idx_other]) + + w = np.maximum(0.0, xx2 - xx1) + h = np.maximum(0.0, yy2 - yy1) + inter = w * h + + if min_mode: + over = inter / np.minimum(areas[order[0]], areas[order[1:]]) + else: + over = inter / (areas[order[0]] + areas[order[1:]] - inter) + + inds = np.where(over <= nms_thresh)[0] + order = order[inds + 1] + + return np.array(keep) + + + +def plot_boxes_cv2(img, boxes, savename=None, class_names=None, color=None): + import cv2 + img = np.copy(img) + colors = np.array([[1, 0, 1], [0, 0, 1], [0, 1, 1], [0, 1, 0], [1, 1, 0], [1, 0, 0]], dtype=np.float32) + + def get_color(c, x, max_val): + ratio = float(x) / max_val * 5 + i = int(math.floor(ratio)) + j = int(math.ceil(ratio)) + ratio = ratio - i + r = (1 - ratio) * colors[i][c] + ratio * colors[j][c] + return int(r * 255) + + width = img.shape[1] + height = img.shape[0] + for i in range(len(boxes)): + box = boxes[i] + x1 = int(box[0] * width) + y1 = int(box[1] * height) + x2 = int(box[2] * width) + y2 = int(box[3] * height) + + if color: + rgb = color + else: + rgb = (255, 0, 0) + if len(box) >= 7 and class_names: + cls_conf = box[5] + cls_id = box[6] + print('%s: %f' % (class_names[cls_id], cls_conf)) + classes = len(class_names) + offset = cls_id * 123457 % classes + red = get_color(2, offset, classes) + green = get_color(1, offset, classes) + blue = get_color(0, offset, classes) + if color is None: + rgb = (red, green, blue) + img = cv2.putText(img, class_names[cls_id], (x1, y1), cv2.FONT_HERSHEY_SIMPLEX, 1.2, rgb, 1) + img = cv2.rectangle(img, (x1, y1), (x2, y2), rgb, 1) + if savename: + print("save plot results to %s" % savename) + cv2.imwrite(savename, img) + return img + + +def read_truths(lab_path): + if not os.path.exists(lab_path): + return np.array([]) + if os.path.getsize(lab_path): + truths = np.loadtxt(lab_path) + truths = truths.reshape(truths.size / 5, 5) # to avoid single truth problem + return truths + else: + return np.array([]) + + +def load_class_names(namesfile): + class_names = [] + with open(namesfile, 'r') as fp: + lines = fp.readlines() + for line in lines: + line = line.rstrip() + 
class_names.append(line) + return class_names + + + +def post_processing(img, conf_thresh, nms_thresh, output): + + # anchors = [12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401] + # num_anchors = 9 + # anchor_masks = [[0, 1, 2], [3, 4, 5], [6, 7, 8]] + # strides = [8, 16, 32] + # anchor_step = len(anchors) // num_anchors + + # [batch, num, 1, 4] + box_array = output[0] + # [batch, num, num_classes] + confs = output[1] + + t1 = time.time() + + if type(box_array).__name__ != 'ndarray': + box_array = box_array.cpu().detach().numpy() + confs = confs.cpu().detach().numpy() + + num_classes = confs.shape[2] + + # [batch, num, 4] + box_array = box_array[:, :, 0] + + # [batch, num, num_classes] --> [batch, num] + max_conf = np.max(confs, axis=2) + max_id = np.argmax(confs, axis=2) + + t2 = time.time() + + bboxes_batch = [] + for i in range(box_array.shape[0]): + + argwhere = max_conf[i] > conf_thresh + l_box_array = box_array[i, argwhere, :] + l_max_conf = max_conf[i, argwhere] + l_max_id = max_id[i, argwhere] + + bboxes = [] + # nms for each class + for j in range(num_classes): + + cls_argwhere = l_max_id == j + ll_box_array = l_box_array[cls_argwhere, :] + ll_max_conf = l_max_conf[cls_argwhere] + ll_max_id = l_max_id[cls_argwhere] + + keep = nms_cpu(ll_box_array, ll_max_conf, nms_thresh) + + if (keep.size > 0): + ll_box_array = ll_box_array[keep, :] + ll_max_conf = ll_max_conf[keep] + ll_max_id = ll_max_id[keep] + + for k in range(ll_box_array.shape[0]): + bboxes.append([ll_box_array[k, 0], ll_box_array[k, 1], ll_box_array[k, 2], ll_box_array[k, 3], ll_max_conf[k], ll_max_conf[k], ll_max_id[k]]) + + bboxes_batch.append(bboxes) + + t3 = time.time() + + print('-----------------------------------') + print(' max and argmax : %f' % (t2 - t1)) + print(' nms : %f' % (t3 - t2)) + print('Post processing total : %f' % (t3 - t1)) + print('-----------------------------------') + + return bboxes_batch diff --git a/models/object_detection/pytorch/yolov4/inference/gpu/tool/utils_iou.py b/models/object_detection/pytorch/yolov4/inference/gpu/tool/utils_iou.py new file mode 100644 index 000000000..d2d7b9d14 --- /dev/null +++ b/models/object_detection/pytorch/yolov4/inference/gpu/tool/utils_iou.py @@ -0,0 +1,204 @@ +# -*- coding: utf-8 -*- +''' + +''' +import torch +import os, sys +from torch.nn import functional as F + +import numpy as np +from packaging import version + + +__all__ = [ + "bboxes_iou", + "bboxes_giou", + "bboxes_diou", + "bboxes_ciou", +] + + +if version.parse(torch.__version__) >= version.parse('1.5.0'): + def _true_divide(dividend, divisor): + return torch.true_divide(dividend, divisor) +else: + def _true_divide(dividend, divisor): + return dividend / divisor + +def bboxes_iou(bboxes_a, bboxes_b, fmt='voc', iou_type='iou'): + """Calculate the Intersection of Unions (IoUs) between bounding boxes. + IoU is calculated as a ratio of area of the intersection + and area of the union. + + Args: + bbox_a (array): An array whose shape is :math:`(N, 4)`. + :math:`N` is the number of bounding boxes. + The dtype should be :obj:`numpy.float32`. + bbox_b (array): An array similar to :obj:`bbox_a`, + whose shape is :math:`(K, 4)`. + The dtype should be :obj:`numpy.float32`. + Returns: + array: + An array whose shape is :math:`(N, K)`. \ + An element at index :math:`(n, k)` contains IoUs between \ + :math:`n` th bounding box in :obj:`bbox_a` and :math:`k` th bounding \ + box in :obj:`bbox_b`. 
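+
+    Illustrative check (not from the original chainercv docstring; VOC format,
+    two 2x2 boxes whose overlap covers half of each box's width)::
+
+        a = torch.tensor([[0., 0., 2., 2.]])
+        b = torch.tensor([[1., 0., 3., 2.]])
+        bboxes_iou(a, b, fmt='voc', iou_type='iou')  # intersection 2, union 6 -> ~0.3333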
+ + from: https://github.com/chainer/chainercv + """ + if bboxes_a.shape[1] != 4 or bboxes_b.shape[1] != 4: + raise IndexError + + N, K = bboxes_a.shape[0], bboxes_b.shape[0] + + if fmt.lower() == 'voc': # xmin, ymin, xmax, ymax + # top left + tl_intersect = torch.max( + bboxes_a[:, np.newaxis, :2], + bboxes_b[:, :2] + ) # of shape `(N,K,2)` + # bottom right + br_intersect = torch.min( + bboxes_a[:, np.newaxis, 2:], + bboxes_b[:, 2:] + ) + bb_a = bboxes_a[:, 2:] - bboxes_a[:, :2] + bb_b = bboxes_b[:, 2:] - bboxes_b[:, :2] + # bb_* can also be seen vectors representing box_width, box_height + elif fmt.lower() == 'yolo': # xcen, ycen, w, h + # top left + tl_intersect = torch.max( + bboxes_a[:, np.newaxis, :2] - bboxes_a[:, np.newaxis, 2:] / 2, + bboxes_b[:, :2] - bboxes_b[:, 2:] / 2 + ) + # bottom right + br_intersect = torch.min( + bboxes_a[:, np.newaxis, :2] + bboxes_a[:, np.newaxis, 2:] / 2, + bboxes_b[:, :2] + bboxes_b[:, 2:] / 2 + ) + bb_a = bboxes_a[:, 2:] + bb_b = bboxes_b[:, 2:] + elif fmt.lower() == 'coco': # xmin, ymin, w, h + # top left + tl_intersect = torch.max( + bboxes_a[:, np.newaxis, :2], + bboxes_b[:, :2] + ) + # bottom right + br_intersect = torch.min( + bboxes_a[:, np.newaxis, :2] + bboxes_a[:, np.newaxis, 2:], + bboxes_b[:, :2] + bboxes_b[:, 2:] + ) + bb_a = bboxes_a[:, 2:] + bb_b = bboxes_b[:, 2:] + + area_a = torch.prod(bb_a, 1) + area_b = torch.prod(bb_b, 1) + + # torch.prod(input, dim, keepdim=False, dtype=None) → Tensor + # Returns the product of each row of the input tensor in the given dimension dim + # if tl, br does not form a nondegenerate squre, then the corr. element in the `prod` would be 0 + en = (tl_intersect < br_intersect).type(tl_intersect.type()).prod(dim=2) # shape `(N,K,2)` ---> shape `(N,K)` + + area_intersect = torch.prod(br_intersect - tl_intersect, 2) * en # * ((tl < br).all()) + area_union = (area_a[:, np.newaxis] + area_b - area_intersect) + + iou = _true_divide(area_intersect, area_union) + + if iou_type.lower() == 'iou': + return iou + + if fmt.lower() == 'voc': # xmin, ymin, xmax, ymax + # top left + tl_union = torch.min( + bboxes_a[:, np.newaxis, :2], + bboxes_b[:, :2] + ) # of shape `(N,K,2)` + # bottom right + br_union = torch.max( + bboxes_a[:, np.newaxis, 2:], + bboxes_b[:, 2:] + ) + elif fmt.lower() == 'yolo': # xcen, ycen, w, h + # top left + tl_union = torch.min( + bboxes_a[:, np.newaxis, :2] - bboxes_a[:, np.newaxis, 2:] / 2, + bboxes_b[:, :2] - bboxes_b[:, 2:] / 2 + ) + # bottom right + br_union = torch.max( + bboxes_a[:, np.newaxis, :2] + bboxes_a[:, np.newaxis, 2:] / 2, + bboxes_b[:, :2] + bboxes_b[:, 2:] / 2 + ) + elif fmt.lower() == 'coco': # xmin, ymin, w, h + # top left + tl_union = torch.min( + bboxes_a[:, np.newaxis, :2], + bboxes_b[:, :2] + ) + # bottom right + br_union = torch.max( + bboxes_a[:, np.newaxis, :2] + bboxes_a[:, np.newaxis, 2:], + bboxes_b[:, :2] + bboxes_b[:, 2:] + ) + + # c for covering, of shape `(N,K,2)` + # the last dim is box width, box hight + bboxes_c = br_union - tl_union + + area_covering = torch.prod(bboxes_c, 2) # shape `(N,K)` + + giou = iou - _true_divide(area_covering - area_union, area_covering) + + if iou_type.lower() == 'giou': + return giou + + if fmt.lower() == 'voc': # xmin, ymin, xmax, ymax + centre_a = (bboxes_a[..., 2 :] + bboxes_a[..., : 2]) / 2 + centre_b = (bboxes_b[..., 2 :] + bboxes_b[..., : 2]) / 2 + elif fmt.lower() == 'yolo': # xcen, ycen, w, h + centre_a = bboxes_a[..., : 2] + centre_b = bboxes_b[..., : 2] + elif fmt.lower() == 'coco': # xmin, ymin, w, h + centre_a = 
bboxes_a[..., 2 :] + bboxes_a[..., : 2]/2 + centre_b = bboxes_b[..., 2 :] + bboxes_b[..., : 2]/2 + + centre_dist = torch.norm(centre_a[:, np.newaxis] - centre_b, p='fro', dim=2) + diag_len = torch.norm(bboxes_c, p='fro', dim=2) + + diou = iou - _true_divide(centre_dist.pow(2), diag_len.pow(2)) + + if iou_type.lower() == 'diou': + return diou + + """ the legacy custom cosine similarity: + + # bb_a of shape `(N,2)`, bb_b of shape `(K,2)` + v = torch.einsum('nm,km->nk', bb_a, bb_b) + v = _true_divide(v, (torch.norm(bb_a, p='fro', dim=1)[:,np.newaxis] * torch.norm(bb_b, p='fro', dim=1))) + # avoid nan for torch.acos near \pm 1 + # https://github.com/pytorch/pytorch/issues/8069 + eps = 1e-7 + v = torch.clamp(v, -1+eps, 1-eps) + """ + v = F.cosine_similarity(bb_a[:,np.newaxis,:], bb_b, dim=-1) + v = (_true_divide(2*torch.acos(v), np.pi)).pow(2) + with torch.no_grad(): + alpha = (_true_divide(v, 1-iou+v)) * ((iou>=0.5).type(iou.type())) + + ciou = diou - alpha * v + + if iou_type.lower() == 'ciou': + return ciou + + +def bboxes_giou(bboxes_a, bboxes_b, fmt='voc'): + return bboxes_iou(bboxes_a, bboxes_b, fmt, 'giou') + + +def bboxes_diou(bboxes_a, bboxes_b, fmt='voc'): + return bboxes_iou(bboxes_a, bboxes_b, fmt, 'diou') + + +def bboxes_ciou(bboxes_a, bboxes_b, fmt='voc'): + return bboxes_iou(bboxes_a, bboxes_b, fmt, 'ciou') diff --git a/models/object_detection/pytorch/yolov4/inference/gpu/tool/utils_iou_test.py b/models/object_detection/pytorch/yolov4/inference/gpu/tool/utils_iou_test.py new file mode 100644 index 000000000..e15fff4f4 --- /dev/null +++ b/models/object_detection/pytorch/yolov4/inference/gpu/tool/utils_iou_test.py @@ -0,0 +1,296 @@ +# -*- coding: utf-8 -*- +''' + +''' +import torch +import os, sys +from torch.nn import functional as F +from easydict import EasyDict as ED + +import numpy as np +from packaging import version + + +if version.parse(torch.__version__) >= version.parse('1.5.0'): + def _true_divide(dividend, divisor): + return torch.true_divide(dividend, divisor) +else: + def _true_divide(dividend, divisor): + return dividend / divisor + + +def bboxes_iou_test(bboxes_a, bboxes_b, fmt='voc', iou_type='iou'): + """ + test function for the bboxes_iou function in `train_acne.py`, + with message printing and plot + """ + if 'plt' not in dir(): + import matplotlib.pyplot as plt + if 'cv2' not in dir(): + try: + import cv2 + except ModuleNotFoundError: + cv2 = None + from PIL import Image, ImageDraw + + assert iou_type.lower() in ['iou', 'giou', 'diou', 'ciou'] + + if isinstance(bboxes_a, np.ndarray): + bboxes_a = torch.Tensor(bboxes_a) + if isinstance(bboxes_b, np.ndarray): + bboxes_b = torch.Tensor(bboxes_b) + + if bboxes_a.shape[1] != 4 or bboxes_b.shape[1] != 4: + raise IndexError + + N, K = bboxes_a.shape[0], bboxes_b.shape[0] + # if N, K all equal 1, then plot + + # top left + if fmt.lower() == 'voc': # xmin, ymin, xmax, ymax + # top left + tl_intersect = torch.max(bboxes_a[:, np.newaxis, :2], bboxes_b[:, :2]) # of shape `(N,K,2)` + # bottom right + br_intersect = torch.min(bboxes_a[:, np.newaxis, 2:], bboxes_b[:, 2:]) + bb_a = bboxes_a[:, 2:] - bboxes_a[:, :2] # w, h + bb_b = bboxes_b[:, 2:] - bboxes_b[:, :2] # w, h + elif fmt.lower() == 'yolo': # xcen, ycen, w, h + tl_intersect = torch.max((bboxes_a[:, np.newaxis, :2] - bboxes_a[:, np.newaxis, 2:] / 2), + (bboxes_b[:, :2] - bboxes_b[:, 2:] / 2)) + # bottom right + br_intersect = torch.min((bboxes_a[:, np.newaxis, :2] + bboxes_a[:, np.newaxis, 2:] / 2), + (bboxes_b[:, :2] + bboxes_b[:, 2:] / 2)) + bb_a = bboxes_a[:, 2:] + 
bb_b = bboxes_b[:, 2:] + elif fmt.lower() == 'coco': # xmin, ymin, w, h + # top left + tl_intersect = torch.max(bboxes_a[:, np.newaxis, :2], bboxes_b[:, :2]) + # bottom right + br_intersect = torch.min((bboxes_a[:, np.newaxis, :2] + bboxes_a[:, np.newaxis, 2:]), + (bboxes_b[:, :2] + bboxes_b[:, 2:])) + bb_a = bboxes_a[:, 2:] + bb_b = bboxes_b[:, 2:] + + area_a = torch.prod(bb_a, 1) + area_b = torch.prod(bb_b, 1) + + # torch.prod(input, dim, keepdim=False, dtype=None) → Tensor + # Returns the product of each row of the input tensor in the given dimension dim + # if tl, br does not form a nondegenerate squre, then the corr. element in the `prod` would be 0 + en = (tl_intersect < br_intersect).type(tl_intersect.type()).prod(dim=2) # shape `(N,K,2)` ---> shape `(N,K)` + + area_intersect = torch.prod(br_intersect - tl_intersect, 2) * en # * ((tl < br).all()) + area_union = (area_a[:, np.newaxis] + area_b - area_intersect) + + iou = _true_divide(area_intersect, area_union) + + # if iou_type.lower() == 'iou': + # return iou + + if fmt.lower() == 'voc': # xmin, ymin, xmax, ymax + # top left + tl_union = torch.min(bboxes_a[:, np.newaxis, :2], bboxes_b[:, :2]) # of shape `(N,K,2)` + # bottom right + br_union = torch.max(bboxes_a[:, np.newaxis, 2:], bboxes_b[:, 2:]) + elif fmt.lower() == 'yolo': # xcen, ycen, w, h + tl_union = torch.min((bboxes_a[:, np.newaxis, :2] - bboxes_a[:, np.newaxis, 2:] / 2), + (bboxes_b[:, :2] - bboxes_b[:, 2:] / 2)) + # bottom right + br_union = torch.max((bboxes_a[:, np.newaxis, :2] + bboxes_a[:, np.newaxis, 2:] / 2), + (bboxes_b[:, :2] + bboxes_b[:, 2:] / 2)) + elif fmt.lower() == 'coco': # xmin, ymin, w, h + # top left + tl_union = torch.min(bboxes_a[:, np.newaxis, :2], bboxes_b[:, :2]) + # bottom right + br_union = torch.max((bboxes_a[:, np.newaxis, :2] + bboxes_a[:, np.newaxis, 2:]), + (bboxes_b[:, :2] + bboxes_b[:, 2:])) + + # c for covering, of shape `(N,K,2)` + # the last dim is box width, box hight + bboxes_c = br_union - tl_union + + area_covering = torch.prod(bboxes_c, 2) # shape `(N,K)` + + giou = iou - (area_covering - area_union) / area_covering + + print(f"tl_union.shape = {tl_union.shape}") + print(f"br_union.shape = {br_union.shape}") + print(f"bboxes_c.shape = {bboxes_c.shape}") + + # if iou_type.lower() == 'giou': + # return giou + + if fmt.lower() == 'voc': # xmin, ymin, xmax, ymax + centre_a = (bboxes_a[..., 2 :] + bboxes_a[..., : 2]) / 2 + centre_b = (bboxes_b[..., 2 :] + bboxes_b[..., : 2]) / 2 + elif fmt.lower() == 'yolo': # xcen, ycen, w, h + centre_a = (bboxes_a[..., : 2] + bboxes_a[..., 2 :]) / 2 + centre_b = (bboxes_b[..., : 2] + bboxes_b[..., 2 :]) / 2 + elif fmt.lower() == 'coco': # xmin, ymin, w, h + centre_a = bboxes_a[..., 2 :] + bboxes_a[..., : 2]/2 + centre_b = bboxes_b[..., 2 :] + bboxes_b[..., : 2]/2 + + centre_dist = torch.norm(centre_a[:, np.newaxis] - centre_b, p='fro', dim=2) + diag_len = torch.norm(bboxes_c, p='fro', dim=2) + + diou = iou - centre_dist.pow(2) / diag_len.pow(2) + + # if iou_type.lower() == 'diou': + # return diou + + """ the legacy custom cosine similarity: + + # bb_a of shape `(N,2)`, bb_b of shape `(K,2)` + v = torch.einsum('nm,km->nk', bb_a, bb_b) + v = _true_divide(v, (torch.norm(bb_a, p='fro', dim=1)[:,np.newaxis] * torch.norm(bb_b, p='fro', dim=1))) + # avoid nan for torch.acos near \pm 1 + # https://github.com/pytorch/pytorch/issues/8069 + eps = 1e-7 + v = torch.clamp(v, -1+eps, 1-eps) + """ + v = F.cosine_similarity(bb_a[:,np.newaxis,:], bb_b, dim=-1) + v = (_true_divide(2*torch.acos(v), np.pi)).pow(2) + 
alpha = (_true_divide(v, 1-iou+v))*((iou>=0.5).type(iou.type())) + + ciou = diou - alpha * v + + if N==K==1: + print("\n"+"*"*50) + print(f"bboxes_a = {bboxes_a}") + print(f"bboxes_b = {bboxes_b}") + + print(f"area_a = {area_a}") + print(f"area_b = {area_b}") + + print(f"area_intersect = {area_intersect}") + print(f"area_union = {area_union}") + + print(f"tl_intersect = {tl_intersect}") + print(f"br_intersect = {br_intersect}") + print(f"tl_union = {tl_union}") + print(f"br_union = {br_union}") + + print(f"area_covering (area of bboxes_c) = {area_covering}") + + print(f"centre_dist = {centre_dist}") + print(f"diag_len = {diag_len}") + + print("for computing ciou") + inner_product = torch.einsum('nm,km->nk', bb_a, bb_b) + product_of_lengths = torch.norm(bb_a, p='fro', dim=1)[:,np.newaxis] * torch.norm(bb_b, p='fro', dim=1) + print(f"inner product of bb_a and bb_b is {inner_product}") + print(f"product of lengths of bb_a and bb_b is {product_of_lengths}") + print(f"inner product divided by product of lengths equals {_true_divide(inner_product, product_of_lengths)}") + print(f"normalized angle distance = {v}") + print(f"alpha = {alpha}") + print(f"v = {v}") + print(f"alpha = {alpha}") + + bc = ED({"xmin":tl_union.numpy().astype(int)[0][0][0], "ymin":tl_union.numpy().astype(int)[0][0][1], "xmax":br_union.numpy().astype(int)[0][0][0], "ymax":br_union.numpy().astype(int)[0][0][1]}) + adjust_x = bc.xmin - int(0.25*(bc.xmax-bc.xmin)) + adjust_y = bc.ymin - int(0.25*(bc.ymax-bc.ymin)) + + print(f"adjust_x = {adjust_x}") + print(f"adjust_y = {adjust_y}") + + bc.xmin, bc.ymin, bc.xmax, bc.ymax = bc.xmin-adjust_x, bc.ymin-adjust_y, bc.xmax-adjust_x, bc.ymax-adjust_y + + ba, bb = bboxes_a.numpy().astype(int)[0], bboxes_b.numpy().astype(int)[0] + if fmt.lower() == 'voc': # xmin, ymin, xmax, ymax + ba = ED({"xmin":ba[0]-adjust_x, "ymin":ba[1]-adjust_y, "xmax":ba[2]-adjust_x, "ymax":ba[3]-adjust_y}) + bb = ED({"xmin":bb[0]-adjust_x, "ymin":bb[1]-adjust_y, "xmax":bb[2]-adjust_x, "ymax":bb[3]-adjust_y}) + elif fmt.lower() == 'yolo': # xcen, ycen, w, h + ba = ED({"xmin":ba[0]-ba[2]//2-adjust_x, "ymin":ba[1]-ba[3]//2-adjust_y, "xmax":ba[0]+ba[2]//2-adjust_x, "ymax":ba[1]+ba[3]//2-adjust_y}) + bb = ED({"xmin":bb[0]-bb[2]//2-adjust_x, "ymin":bb[1]-bb[3]//2-adjust_y, "xmax":bb[0]+bb[2]//2-adjust_x, "ymax":bb[1]+bb[3]//2-adjust_y}) + elif fmt.lower() == 'coco': # xmin, ymin, w, h + ba = ED({"xmin":ba[0]-adjust_x, "ymin":ba[1]-adjust_y, "xmax":ba[0]+ba[2]-adjust_x, "ymax":ba[1]+ba[3]-adjust_y}) + bb = ED({"xmin":bb[0]-adjust_x, "ymin":bb[1]-adjust_y, "xmax":bb[0]+bb[2]-adjust_x, "ymax":bb[1]+bb[3]-adjust_y}) + + print(f"ba = {ba}") + print(f"bb = {bb}") + print(f"bc = {bc}") + + plane = np.full(shape=(int(1.5*(bc.ymax-bc.ymin)),int(1.5*(bc.xmax-bc.xmin)),3), fill_value=255, dtype=np.uint8) + img_with_boxes = plane.copy() + + line_size = 1 + if cv2: + cv2.rectangle(img_with_boxes, (ba.xmin, ba.ymin), (ba.xmax, ba.ymax), (0, 255, 0), line_size) + cv2.rectangle(img_with_boxes, (bb.xmin, bb.ymin), (bb.xmax, bb.ymax), (0, 0, 255), line_size) + cv2.rectangle(img_with_boxes, (max(0,bc.ymin-1), max(0,bc.ymin-1)), (bc.xmax, bc.ymax), (255, 0, 0), line_size) + else: + img_with_boxes = Image.fromarray(img_with_boxes) + drawer = ImageDraw.Draw(img_with_boxes) + # drawer.line([(ba.xmin, ba.ymin), (ba.xmin, ba.ymax), (ba.xmax, ba.ymax), (ba.xmax, ba.ymin), (ba.xmin, ba.ymin)], fill='green', width=line_size) + # drawer.line([(bb.xmin, bb.ymin), (bb.xmin, bb.ymax), (bb.xmax, bb.ymax), (bb.xmax, bb.ymin), (bb.xmin, bb.ymin)], 
fill='blue', width=line_size) + # drawer.line([((max(0,bc.xmin-1), max(0,bc.ymin-1)), ((max(0,bc.xmin-1), bc.ymax), (bc.xmax, bc.ymax), (bc.xmax, max(0,bc.ymin-1)), ((max(0,bc.xmin-1), max(0,bc.ymin-1))], fill='red', width=line_size) + drawer.rectangle([(ba.xmin, ba.ymin), (ba.xmax, ba.ymax)], outline='green', width=line_size) + drawer.rectangle([(bb.xmin, bb.ymin), (bb.xmax, bb.ymax)], outline='blue', width=line_size) + drawer.rectangle([(max(0,bc.xmin-1), max(0,bc.ymin-1)), (bc.xmax+1, bc.ymax+1)], outline='red', width=line_size) + img_with_boxes = np.array(img_with_boxes) + del drawer + + plt.figure(figsize=(7,7)) + plt.imshow(img_with_boxes) + plt.show() + + print(f"iou = {iou}") + print(f"giou = {giou}") + print(f"diou = {diou}") + print(f"ciou = {ciou}") + + if iou_type.lower() == 'ciou': + return ciou + elif iou_type.lower() == 'diou': + return diou + elif iou_type.lower() == 'giou': + return giou + elif iou_type.lower() == 'iou': + return iou + + +def original_iou_test(bboxes_a, bboxes_b, xyxy=True): + """ + test function for the original iou function in `train.py` + """ + if bboxes_a.shape[1] != 4 or bboxes_b.shape[1] != 4: + raise IndexError + + if isinstance(bboxes_a, np.ndarray): + bboxes_a = torch.Tensor(bboxes_a) + if isinstance(bboxes_b, np.ndarray): + bboxes_b = torch.Tensor(bboxes_a) + + N, K = bboxes_a.shape[0], bboxes_b.shape[0] + # if N, K all equal 1, then plot + + # top left + if xyxy: + tl = torch.max(bboxes_a[:, None, :2], bboxes_b[:, :2]) + # bottom right + br = torch.min(bboxes_a[:, None, 2:], bboxes_b[:, 2:]) + area_a = torch.prod(bboxes_a[:, 2:] - bboxes_a[:, :2], 1) + area_b = torch.prod(bboxes_b[:, 2:] - bboxes_b[:, :2], 1) + else: + tl = torch.max((bboxes_a[:, None, :2] - bboxes_a[:, None, 2:] / 2), + (bboxes_b[:, :2] - bboxes_b[:, 2:] / 2)) + # bottom right + br = torch.min((bboxes_a[:, None, :2] + bboxes_a[:, None, 2:] / 2), + (bboxes_b[:, :2] + bboxes_b[:, 2:] / 2)) + + area_a = torch.prod(bboxes_a[:, 2:], 1) + area_b = torch.prod(bboxes_b[:, 2:], 1) + en = (tl < br).type(tl.type()).prod(dim=2) + area_i = torch.prod(br - tl, 2) * en # * ((tl < br).all()) + + print(f"tl.shape = {tl.shape}") + print(f"br.shape = {br.shape}") + print(f"area_a.shape = {area_a.shape}") + print(f"area_b.shape = {area_b.shape}") + print(f"en.shape = {en.shape}") + print(f"area_i.shape = {area_i.shape}") + + if N == K == 1: + pass + + return area_i / (area_a[:, None] + area_b - area_i) diff --git a/models/object_detection/pytorch/yolov4/inference/gpu/tool/yolo_layer.py b/models/object_detection/pytorch/yolov4/inference/gpu/tool/yolo_layer.py new file mode 100644 index 000000000..3af1fc63c --- /dev/null +++ b/models/object_detection/pytorch/yolov4/inference/gpu/tool/yolo_layer.py @@ -0,0 +1,329 @@ +import torch.nn as nn +import torch.nn.functional as F +from tool.torch_utils import * + +def yolo_forward(output, conf_thresh, num_classes, anchors, num_anchors, scale_x_y, only_objectness=1, + validation=False): + # Output would be invalid if it does not satisfy this assert + # assert (output.size(1) == (5 + num_classes) * num_anchors) + + # print(output.size()) + + # Slice the second dimension (channel) of output into: + # [ 2, 2, 1, num_classes, 2, 2, 1, num_classes, 2, 2, 1, num_classes ] + # And then into + # bxy = [ 6 ] bwh = [ 6 ] det_conf = [ 3 ] cls_conf = [ num_classes * 3 ] + batch = output.size(0) + H = output.size(2) + W = output.size(3) + + bxy_list = [] + bwh_list = [] + det_confs_list = [] + cls_confs_list = [] + + for i in range(num_anchors): + begin = i * (5 + 
num_classes) + end = (i + 1) * (5 + num_classes) + + bxy_list.append(output[:, begin : begin + 2]) + bwh_list.append(output[:, begin + 2 : begin + 4]) + det_confs_list.append(output[:, begin + 4 : begin + 5]) + cls_confs_list.append(output[:, begin + 5 : end]) + + # Shape: [batch, num_anchors * 2, H, W] + bxy = torch.cat(bxy_list, dim=1) + # Shape: [batch, num_anchors * 2, H, W] + bwh = torch.cat(bwh_list, dim=1) + + # Shape: [batch, num_anchors, H, W] + det_confs = torch.cat(det_confs_list, dim=1) + # Shape: [batch, num_anchors * H * W] + det_confs = det_confs.reshape(batch, num_anchors * H * W) + + # Shape: [batch, num_anchors * num_classes, H, W] + cls_confs = torch.cat(cls_confs_list, dim=1) + # Shape: [batch, num_anchors, num_classes, H * W] + cls_confs = cls_confs.reshape(batch, num_anchors, num_classes, H * W) + # Shape: [batch, num_anchors, num_classes, H * W] --> [batch, num_anchors * H * W, num_classes] + cls_confs = cls_confs.permute(0, 1, 3, 2).reshape(batch, num_anchors * H * W, num_classes) + + # Apply sigmoid(), exp() and softmax() to slices + # + bxy = torch.sigmoid(bxy) * scale_x_y - 0.5 * (scale_x_y - 1) + bwh = torch.exp(bwh) + det_confs = torch.sigmoid(det_confs) + cls_confs = torch.sigmoid(cls_confs) + + # Prepare C-x, C-y, P-w, P-h (None of them are torch related) + grid_x = np.expand_dims(np.expand_dims(np.expand_dims(np.linspace(0, W - 1, W), axis=0).repeat(H, 0), axis=0), axis=0) + grid_y = np.expand_dims(np.expand_dims(np.expand_dims(np.linspace(0, H - 1, H), axis=1).repeat(W, 1), axis=0), axis=0) + # grid_x = torch.linspace(0, W - 1, W).reshape(1, 1, 1, W).repeat(1, 1, H, 1) + # grid_y = torch.linspace(0, H - 1, H).reshape(1, 1, H, 1).repeat(1, 1, 1, W) + + anchor_w = [] + anchor_h = [] + for i in range(num_anchors): + anchor_w.append(anchors[i * 2]) + anchor_h.append(anchors[i * 2 + 1]) + + device = None + + #cuda_check = output.is_cuda + #if cuda_check: + # device = output.get_device() + + ##This is workaroud for pytorch, will change it to 'get_device' it after PYTORCHDGQ-423 is fixed. 
+ device = output.device + + bx_list = [] + by_list = [] + bw_list = [] + bh_list = [] + + # Apply C-x, C-y, P-w, P-h + for i in range(num_anchors): + ii = i * 2 + # Shape: [batch, 1, H, W] + bx = bxy[:, ii : ii + 1] + torch.tensor(grid_x, device=device, dtype=torch.float32) # grid_x.to(device=device, dtype=torch.float32) + # Shape: [batch, 1, H, W] + by = bxy[:, ii + 1 : ii + 2] + torch.tensor(grid_y, device=device, dtype=torch.float32) # grid_y.to(device=device, dtype=torch.float32) + # Shape: [batch, 1, H, W] + bw = bwh[:, ii : ii + 1] * anchor_w[i] + # Shape: [batch, 1, H, W] + bh = bwh[:, ii + 1 : ii + 2] * anchor_h[i] + + bx_list.append(bx) + by_list.append(by) + bw_list.append(bw) + bh_list.append(bh) + + + ######################################## + # Figure out bboxes from slices # + ######################################## + + # Shape: [batch, num_anchors, H, W] + bx = torch.cat(bx_list, dim=1) + # Shape: [batch, num_anchors, H, W] + by = torch.cat(by_list, dim=1) + # Shape: [batch, num_anchors, H, W] + bw = torch.cat(bw_list, dim=1) + # Shape: [batch, num_anchors, H, W] + bh = torch.cat(bh_list, dim=1) + + # Shape: [batch, 2 * num_anchors, H, W] + bx_bw = torch.cat((bx, bw), dim=1) + # Shape: [batch, 2 * num_anchors, H, W] + by_bh = torch.cat((by, bh), dim=1) + + # normalize coordinates to [0, 1] + bx_bw /= W + by_bh /= H + + # Shape: [batch, num_anchors * H * W, 1] + bx = bx_bw[:, :num_anchors].reshape(batch, num_anchors * H * W, 1) + by = by_bh[:, :num_anchors].reshape(batch, num_anchors * H * W, 1) + bw = bx_bw[:, num_anchors:].reshape(batch, num_anchors * H * W, 1) + bh = by_bh[:, num_anchors:].reshape(batch, num_anchors * H * W, 1) + + bx1 = bx - bw * 0.5 + by1 = by - bh * 0.5 + bx2 = bx1 + bw + by2 = by1 + bh + + # Shape: [batch, num_anchors * h * w, 4] -> [batch, num_anchors * h * w, 1, 4] + boxes = torch.cat((bx1, by1, bx2, by2), dim=2).reshape(batch, num_anchors * H * W, 1, 4) + # boxes = boxes.repeat(1, 1, num_classes, 1) + + # boxes: [batch, num_anchors * H * W, 1, 4] + # cls_confs: [batch, num_anchors * H * W, num_classes] + # det_confs: [batch, num_anchors * H * W] + + det_confs = det_confs.reshape(batch, num_anchors * H * W, 1) + confs = cls_confs * det_confs + + # boxes: [batch, num_anchors * H * W, 1, 4] + # confs: [batch, num_anchors * H * W, num_classes] + + return boxes, confs + + +def yolo_forward_dynamic(output, conf_thresh, num_classes, anchors, num_anchors, scale_x_y, only_objectness=1, + validation=False): + # Output would be invalid if it does not satisfy this assert + # assert (output.size(1) == (5 + num_classes) * num_anchors) + + # print(output.size()) + + # Slice the second dimension (channel) of output into: + # [ 2, 2, 1, num_classes, 2, 2, 1, num_classes, 2, 2, 1, num_classes ] + # And then into + # bxy = [ 6 ] bwh = [ 6 ] det_conf = [ 3 ] cls_conf = [ num_classes * 3 ] + # batch = output.size(0) + # H = output.size(2) + # W = output.size(3) + + bxy_list = [] + bwh_list = [] + det_confs_list = [] + cls_confs_list = [] + + for i in range(num_anchors): + begin = i * (5 + num_classes) + end = (i + 1) * (5 + num_classes) + + bxy_list.append(output[:, begin : begin + 2]) + bwh_list.append(output[:, begin + 2 : begin + 4]) + det_confs_list.append(output[:, begin + 4 : begin + 5]) + cls_confs_list.append(output[:, begin + 5 : end]) + + # Shape: [batch, num_anchors * 2, H, W] + bxy = torch.cat(bxy_list, dim=1) + # Shape: [batch, num_anchors * 2, H, W] + bwh = torch.cat(bwh_list, dim=1) + + # Shape: [batch, num_anchors, H, W] + det_confs = 
torch.cat(det_confs_list, dim=1) + # Shape: [batch, num_anchors * H * W] + det_confs = det_confs.reshape(output.size(0), num_anchors * output.size(2) * output.size(3)) + + # Shape: [batch, num_anchors * num_classes, H, W] + cls_confs = torch.cat(cls_confs_list, dim=1) + # Shape: [batch, num_anchors, num_classes, H * W] + cls_confs = cls_confs.reshape(output.size(0), num_anchors, num_classes, output.size(2) * output.size(3)) + # Shape: [batch, num_anchors, num_classes, H * W] --> [batch, num_anchors * H * W, num_classes] + cls_confs = cls_confs.permute(0, 1, 3, 2).reshape(output.size(0), num_anchors * output.size(2) * output.size(3), num_classes) + + # Apply sigmoid(), exp() and softmax() to slices + # + bxy = torch.sigmoid(bxy) * scale_x_y - 0.5 * (scale_x_y - 1) + bwh = torch.exp(bwh) + det_confs = torch.sigmoid(det_confs) + cls_confs = torch.sigmoid(cls_confs) + + # Prepare C-x, C-y, P-w, P-h (None of them are torch related) + grid_x = np.expand_dims(np.expand_dims(np.expand_dims(np.linspace(0, output.size(3) - 1, output.size(3)), axis=0).repeat(output.size(2), 0), axis=0), axis=0) + grid_y = np.expand_dims(np.expand_dims(np.expand_dims(np.linspace(0, output.size(2) - 1, output.size(2)), axis=1).repeat(output.size(3), 1), axis=0), axis=0) + # grid_x = torch.linspace(0, W - 1, W).reshape(1, 1, 1, W).repeat(1, 1, H, 1) + # grid_y = torch.linspace(0, H - 1, H).reshape(1, 1, H, 1).repeat(1, 1, 1, W) + + anchor_w = [] + anchor_h = [] + for i in range(num_anchors): + anchor_w.append(anchors[i * 2]) + anchor_h.append(anchors[i * 2 + 1]) + + device = None + #cuda_check = output.is_cuda + #if cuda_check: + # device = output.get_device() + + ##This is workaroud for pytorch, will change it to 'get_device' it after PYTORCHDGQ-423 is fixed. + device = output.device + + bx_list = [] + by_list = [] + bw_list = [] + bh_list = [] + + # Apply C-x, C-y, P-w, P-h + for i in range(num_anchors): + ii = i * 2 + # Shape: [batch, 1, H, W] + bx = bxy[:, ii : ii + 1] + torch.tensor(grid_x, device=device, dtype=torch.float32) # grid_x.to(device=device, dtype=torch.float32) + # Shape: [batch, 1, H, W] + by = bxy[:, ii + 1 : ii + 2] + torch.tensor(grid_y, device=device, dtype=torch.float32) # grid_y.to(device=device, dtype=torch.float32) + # Shape: [batch, 1, H, W] + bw = bwh[:, ii : ii + 1] * anchor_w[i] + # Shape: [batch, 1, H, W] + bh = bwh[:, ii + 1 : ii + 2] * anchor_h[i] + + bx_list.append(bx) + by_list.append(by) + bw_list.append(bw) + bh_list.append(bh) + + + ######################################## + # Figure out bboxes from slices # + ######################################## + + # Shape: [batch, num_anchors, H, W] + bx = torch.cat(bx_list, dim=1) + # Shape: [batch, num_anchors, H, W] + by = torch.cat(by_list, dim=1) + # Shape: [batch, num_anchors, H, W] + bw = torch.cat(bw_list, dim=1) + # Shape: [batch, num_anchors, H, W] + bh = torch.cat(bh_list, dim=1) + + # Shape: [batch, 2 * num_anchors, H, W] + bx_bw = torch.cat((bx, bw), dim=1) + # Shape: [batch, 2 * num_anchors, H, W] + by_bh = torch.cat((by, bh), dim=1) + + # normalize coordinates to [0, 1] + bx_bw /= output.size(3) + by_bh /= output.size(2) + + # Shape: [batch, num_anchors * H * W, 1] + bx = bx_bw[:, :num_anchors].reshape(output.size(0), num_anchors * output.size(2) * output.size(3), 1) + by = by_bh[:, :num_anchors].reshape(output.size(0), num_anchors * output.size(2) * output.size(3), 1) + bw = bx_bw[:, num_anchors:].reshape(output.size(0), num_anchors * output.size(2) * output.size(3), 1) + bh = by_bh[:, num_anchors:].reshape(output.size(0), 
num_anchors * output.size(2) * output.size(3), 1) + + bx1 = bx - bw * 0.5 + by1 = by - bh * 0.5 + bx2 = bx1 + bw + by2 = by1 + bh + + # Shape: [batch, num_anchors * h * w, 4] -> [batch, num_anchors * h * w, 1, 4] + boxes = torch.cat((bx1, by1, bx2, by2), dim=2).reshape(output.size(0), num_anchors * output.size(2) * output.size(3), 1, 4) + # boxes = boxes.repeat(1, 1, num_classes, 1) + + # boxes: [batch, num_anchors * H * W, 1, 4] + # cls_confs: [batch, num_anchors * H * W, num_classes] + # det_confs: [batch, num_anchors * H * W] + + det_confs = det_confs.reshape(output.size(0), num_anchors * output.size(2) * output.size(3), 1) + confs = cls_confs * det_confs + + # boxes: [batch, num_anchors * H * W, 1, 4] + # confs: [batch, num_anchors * H * W, num_classes] + + return boxes, confs + +class YoloLayer(nn.Module): + ''' Yolo layer + model_out: while inference,is post-processing inside or outside the model + true:outside + ''' + def __init__(self, anchor_mask=[], num_classes=0, anchors=[], num_anchors=1, stride=32, model_out=False): + super(YoloLayer, self).__init__() + self.anchor_mask = anchor_mask + self.num_classes = num_classes + self.anchors = anchors + self.num_anchors = num_anchors + self.anchor_step = len(anchors) // num_anchors + self.coord_scale = 1 + self.noobject_scale = 1 + self.object_scale = 5 + self.class_scale = 1 + self.thresh = 0.6 + self.stride = stride + self.seen = 0 + self.scale_x_y = 1 + + self.model_out = model_out + + def forward(self, output, target=None): + if self.training: + return output + masked_anchors = [] + for m in self.anchor_mask: + masked_anchors += self.anchors[m * self.anchor_step:(m + 1) * self.anchor_step] + masked_anchors = [anchor / self.stride for anchor in masked_anchors] + + return yolo_forward_dynamic(output, self.thresh, self.num_classes, masked_anchors, len(self.anchor_mask),scale_x_y=self.scale_x_y) + diff --git a/models/object_detection/tensorflow/ssd-mobilenet/inference/int8/coco_detection_evaluator.py b/models/object_detection/tensorflow/ssd-mobilenet/inference/int8/coco_detection_evaluator.py new file mode 100644 index 000000000..3d8738d5a --- /dev/null +++ b/models/object_detection/tensorflow/ssd-mobilenet/inference/int8/coco_detection_evaluator.py @@ -0,0 +1,104 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
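Looking back at `YoloLayer.forward` above, the anchor list is first filtered by `anchor_mask` and then rescaled from pixels to grid-cell units by dividing by the layer stride. A small standalone sketch of that bookkeeping, using typical YOLOv4 COCO anchors as an assumed example (they are not read from this repository's config):

```python
# Illustrative anchor masking/rescaling, mirroring YoloLayer.forward (values are assumptions).
anchors = [12, 16, 19, 36, 40, 28, 36, 75, 76, 55, 72, 146, 142, 110, 192, 243, 459, 401]
num_anchors = 9
anchor_step = len(anchors) // num_anchors   # two values (w, h) per anchor
anchor_mask = [0, 1, 2]                     # anchors assigned to this detection head
stride = 8                                  # downsampling factor of this head

masked_anchors = []
for m in anchor_mask:
    masked_anchors += anchors[m * anchor_step:(m + 1) * anchor_step]
masked_anchors = [a / stride for a in masked_anchors]
print(masked_anchors)  # [1.5, 2.0, 2.375, 4.5, 5.0, 3.5] -> anchor sizes in grid cells
```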
+# +# + +from inference import coco_tools +from inference import coco_label_map + +class CocoDetectionEvaluator: + """Class to evaluate COCO detection metrics.""" + + def __init__(self): + self._image_ids = {} + self._groundtruth_list = [] + self._detection_boxes_list = [] + self._annotation_id = 1 + self._category_id_set = set([cat for cat in coco_label_map.category_map]) + self._groundtruth_list = [] + self._detection_boxes_list = [] + + def add_single_ground_truth_image_info(self, + image_id, + groundtruth_dict): + if image_id in self._image_ids: + return + + self._groundtruth_list.extend( + coco_tools.ExportSingleImageGroundtruthToCoco( + image_id=image_id, + next_annotation_id=self._annotation_id, + category_id_set=self._category_id_set, + groundtruth_boxes=groundtruth_dict['boxes'], + groundtruth_classes=groundtruth_dict['classes'])) + self._annotation_id += groundtruth_dict['boxes'].shape[0] + + self._image_ids[image_id] = False + is_debug = False + if image_id == '000000059386.jpg': + is_debug = True + if is_debug: + is_debug = False + print(groundtruth_dict['boxes']) + print(groundtruth_dict['classes']) + print(image_id) + + def add_single_detected_image_info(self, + image_id, + detections_dict): + assert (image_id in self._image_ids) + + if self._image_ids[image_id]: + return + + self._detection_boxes_list.extend( + coco_tools.ExportSingleImageDetectionBoxesToCoco( + image_id=image_id, + category_id_set=self._category_id_set, + detection_boxes=detections_dict['boxes'], + detection_scores=detections_dict['scores'], + detection_classes=detections_dict['classes'])) + + self._image_ids[image_id] = True + is_debug = False + if image_id == '000000059386.jpg': + is_debug = True + if is_debug: + is_debug = False + print(detections_dict['boxes']) + print(detections_dict['classes']) + print(detections_dict['classes']) + print(image_id) + + def evaluate(self): + groundtruth_dict = { + 'annotations': self._groundtruth_list, + 'images': [{'id': image_id} for image_id in self._image_ids], + 'categories': [{'id': k, 'name': v} for k, v in coco_label_map.category_map.items()] + } + coco_wrapped_groundtruth = coco_tools.COCOWrapper(groundtruth_dict) + coco_wrapped_detections = coco_wrapped_groundtruth.LoadAnnotations( + self._detection_boxes_list) + box_evaluator = coco_tools.COCOEvalWrapper( + coco_wrapped_groundtruth, coco_wrapped_detections, agnostic_mode=False) + box_metrics, box_per_category_ap = box_evaluator.ComputeMetrics( + include_metrics_per_category=False, + all_metrics_per_category=False) + box_metrics.update(box_per_category_ap) + box_metrics = {'DetectionBoxes_'+ key: value + for key, value in iter(box_metrics.items())} + return box_metrics diff --git a/models/object_detection/tensorflow/ssd-mobilenet/inference/int8/coco_label_map.py b/models/object_detection/tensorflow/ssd-mobilenet/inference/int8/coco_label_map.py new file mode 100644 index 000000000..6127c2aab --- /dev/null +++ b/models/object_detection/tensorflow/ssd-mobilenet/inference/int8/coco_label_map.py @@ -0,0 +1,103 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2019 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
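The `CocoDetectionEvaluator` above accumulates ground truth and detections per image and defers the real work to `coco_tools`. A hypothetical end-to-end use looks like the following; the boxes, scores, and image id are invented for illustration, and running it requires `pycocotools` plus the accompanying `coco_tools`/`coco_label_map` modules on the import path.

```python
# Sketch of the evaluator workflow (illustrative values; import path assumed).
import numpy as np
from coco_detection_evaluator import CocoDetectionEvaluator

evaluator = CocoDetectionEvaluator()
evaluator.add_single_ground_truth_image_info(
    image_id='000000000139.jpg',
    groundtruth_dict={
        'boxes': np.array([[50.0, 60.0, 120.0, 200.0]], dtype=np.float32),  # [ymin, xmin, ymax, xmax]
        'classes': np.array([1]),                                           # 1 == 'person'
    })
evaluator.add_single_detected_image_info(
    image_id='000000000139.jpg',
    detections_dict={
        'boxes': np.array([[52.0, 61.0, 118.0, 198.0]], dtype=np.float32),
        'scores': np.array([0.9], dtype=np.float32),
        'classes': np.array([1]),
    })
metrics = evaluator.evaluate()
print(metrics['DetectionBoxes_Precision/mAP'])
```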
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# + +category_map = { + 1: 'person', + 2: 'bicycle', + 3: 'car', + 4: 'motorcycle', + 5: 'airplane', + 6: 'bus', + 7: 'train', + 8: 'truck', + 9: 'boat', + 10: 'traffic light', + 11: 'fire hydrant', + 13: 'stop sign', + 14: 'parking meter', + 15: 'bench', + 16: 'bird', + 17: 'cat', + 18: 'dog', + 19: 'horse', + 20: 'sheep', + 21: 'cow', + 22: 'elephant', + 23: 'bear', + 24: 'zebra', + 25: 'giraffe', + 27: 'backpack', + 28: 'umbrella', + 31: 'handbag', + 32: 'tie', + 33: 'suitcase', + 34: 'frisbee', + 35: 'skis', + 36: 'snowboard', + 37: 'sports ball', + 38: 'kite', + 39: 'baseball bat', + 40: 'baseball glove', + 41: 'skateboard', + 42: 'surfboard', + 43: 'tennis racket', + 44: 'bottle', + 46: 'wine glass', + 47: 'cup', + 48: 'fork', + 49: 'knife', + 50: 'spoon', + 51: 'bowl', + 52: 'banana', + 53: 'apple', + 54: 'sandwich', + 55: 'orange', + 56: 'broccoli', + 57: 'carrot', + 58: 'hot dog', + 59: 'pizza', + 60: 'donut', + 61: 'cake', + 62: 'chair', + 63: 'couch', + 64: 'potted plant', + 65: 'bed', + 67: 'dining table', + 70: 'toilet', + 72: 'tv', + 73: 'laptop', + 74: 'mouse', + 75: 'remote', + 76: 'keyboard', + 77: 'cell phone', + 78: 'microwave', + 79: 'oven', + 80: 'toaster', + 81: 'sink', + 82: 'refrigerator', + 84: 'book', + 85: 'clock', + 86: 'vase', + 87: 'scissors', + 88: 'teddy bear', + 89: 'hair drier', + 90: 'toothbrush' +} + + diff --git a/models/object_detection/tensorflow/ssd-mobilenet/inference/int8/coco_tools.py b/models/object_detection/tensorflow/ssd-mobilenet/inference/int8/coco_tools.py new file mode 100644 index 000000000..58e9483b7 --- /dev/null +++ b/models/object_detection/tensorflow/ssd-mobilenet/inference/int8/coco_tools.py @@ -0,0 +1,530 @@ +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +"""Wrappers for third party pycocotools to be used within object_detection. + +Note that nothing in this file is tensorflow related and thus cannot +be called directly as a slim metric, for example. 
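Returning briefly to the `category_map` added above: `infer_detections.py` builds the reverse mapping (name to COCO id) from it, which is handy when labels arrive as strings. A tiny illustration, with the module path assumed:

```python
# Reverse lookup over the COCO label map, as done in infer_detections.py.
from coco_label_map import category_map  # module path assumed; adjust to your layout

category_map_reverse = {v: k for k, v in category_map.items()}
print(category_map[1])                 # 'person'
print(category_map_reverse['person'])  # 1
```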
+ +TODO(jonathanhuang): wrap as a slim metric in metrics.py + + +Usage example: given a set of images with ids in the list image_ids +and corresponding lists of numpy arrays encoding groundtruth (boxes and classes) +and detections (boxes, scores and classes), where elements of each list +correspond to detections/annotations of a single image, +then evaluation (in multi-class mode) can be invoked as follows: + + groundtruth_dict = coco_tools.ExportGroundtruthToCOCO( + image_ids, groundtruth_boxes_list, groundtruth_classes_list, + max_num_classes, output_path=None) + detections_list = coco_tools.ExportDetectionsToCOCO( + image_ids, detection_boxes_list, detection_scores_list, + detection_classes_list, output_path=None) + groundtruth = coco_tools.COCOWrapper(groundtruth_dict) + detections = groundtruth.LoadAnnotations(detections_list) + evaluator = coco_tools.COCOEvalWrapper(groundtruth, detections, + agnostic_mode=False) + metrics = evaluator.ComputeMetrics() + +""" +from collections import OrderedDict +import copy +import time +import numpy as np + +from pycocotools import coco +from pycocotools import cocoeval +from pycocotools import mask + +import tensorflow as tf + + +class COCOWrapper(coco.COCO): + """Wrapper for the pycocotools COCO class.""" + + def __init__(self, dataset, detection_type='bbox'): + """COCOWrapper constructor. + + See http://mscoco.org/dataset/#format for a description of the format. + By default, the coco.COCO class constructor reads from a JSON file. + This function duplicates the same behavior but loads from a dictionary, + allowing us to perform evaluation without writing to external storage. + + Args: + dataset: a dictionary holding bounding box annotations in the COCO format. + detection_type: type of detections being wrapped. Can be one of ['bbox', + 'segmentation'] + + Raises: + ValueError: if detection_type is unsupported. + """ + supported_detection_types = ['bbox', 'segmentation'] + if detection_type not in supported_detection_types: + raise ValueError('Unsupported detection type: {}. ' + 'Supported values are: {}'.format( + detection_type, supported_detection_types)) + self._detection_type = detection_type + coco.COCO.__init__(self) + self.dataset = dataset + self.createIndex() + + def LoadAnnotations(self, annotations): + """Load annotations dictionary into COCO datastructure. + + See http://mscoco.org/dataset/#format for a description of the annotations + format. As above, this function replicates the default behavior of the API + but does not require writing to external storage. + + Args: + annotations: python list holding object detection results where each + detection is encoded as a dict with required keys ['image_id', + 'category_id', 'score'] and one of ['bbox', 'segmentation'] based on + `detection_type`. + + Returns: + a coco.COCO datastructure holding object detection annotations results + + Raises: + ValueError: if annotations is not a list + ValueError: if annotations do not correspond to the images contained + in self. 
+ """ + results = coco.COCO() + results.dataset['images'] = [img for img in self.dataset['images']] + + tf.compat.v1.logging.info('Loading and preparing annotation results...') + tic = time.time() + + if not isinstance(annotations, list): + raise ValueError('annotations is not a list of objects') + annotation_img_ids = [ann['image_id'] for ann in annotations] + if (set(annotation_img_ids) != (set(annotation_img_ids) + & set(self.getImgIds()))): + raise ValueError('Results do not correspond to current coco set') + results.dataset['categories'] = copy.deepcopy(self.dataset['categories']) + if self._detection_type == 'bbox': + for idx, ann in enumerate(annotations): + bb = ann['bbox'] + ann['area'] = bb[2] * bb[3] + ann['id'] = idx + 1 + ann['iscrowd'] = 0 + elif self._detection_type == 'segmentation': + for idx, ann in enumerate(annotations): + ann['area'] = mask.area(ann['segmentation']) + ann['bbox'] = mask.toBbox(ann['segmentation']) + ann['id'] = idx + 1 + ann['iscrowd'] = 0 + tf.compat.v1.logging.info('DONE (t=%0.2fs)', (time.time() - tic)) + + results.dataset['annotations'] = annotations + results.createIndex() + return results + + +class COCOEvalWrapper(cocoeval.COCOeval): + """Wrapper for the pycocotools COCOeval class. + + To evaluate, create two objects (groundtruth_dict and detections_list) + using the conventions listed at http://mscoco.org/dataset/#format. + Then call evaluation as follows: + + groundtruth = coco_tools.COCOWrapper(groundtruth_dict) + detections = groundtruth.LoadAnnotations(detections_list) + evaluator = coco_tools.COCOEvalWrapper(groundtruth, detections, + agnostic_mode=False) + + metrics = evaluator.ComputeMetrics() + """ + + def __init__(self, groundtruth=None, detections=None, agnostic_mode=False, + iou_type='bbox'): + """COCOEvalWrapper constructor. + + Note that for the area-based metrics to be meaningful, detection and + groundtruth boxes must be in image coordinates measured in pixels. + + Args: + groundtruth: a coco.COCO (or coco_tools.COCOWrapper) object holding + groundtruth annotations + detections: a coco.COCO (or coco_tools.COCOWrapper) object holding + detections + agnostic_mode: boolean (default: False). If True, evaluation ignores + class labels, treating all detections as proposals. + iou_type: IOU type to use for evaluation. Supports `bbox` or `segm`. + """ + cocoeval.COCOeval.__init__(self, groundtruth, detections, + iouType=iou_type) + if agnostic_mode: + self.params.useCats = 0 + + def GetCategory(self, category_id): + """Fetches dictionary holding category information given category id. + + Args: + category_id: integer id + Returns: + dictionary holding 'id', 'name'. + """ + return self.cocoGt.cats[category_id] + + def GetAgnosticMode(self): + """Returns true if COCO Eval is configured to evaluate in agnostic mode.""" + return self.params.useCats == 0 + + def GetCategoryIdList(self): + """Returns list of valid category ids.""" + return self.params.catIds + + def ComputeMetrics(self, + include_metrics_per_category=False, + all_metrics_per_category=False): + """Computes detection metrics. + + Args: + include_metrics_per_category: If True, will include metrics per category. + all_metrics_per_category: If true, include all the summery metrics for + each category in per_category_ap. Be careful with setting it to true if + you have more than handful of categories, because it will pollute + your mldash. + + Returns: + 1. 
summary_metrics: a dictionary holding: + 'Precision/mAP': mean average precision over classes averaged over IOU + thresholds ranging from .5 to .95 with .05 increments + 'Precision/mAP@.50IOU': mean average precision at 50% IOU + 'Precision/mAP@.75IOU': mean average precision at 75% IOU + 'Precision/mAP (small)': mean average precision for small objects + (area < 32^2 pixels) + 'Precision/mAP (medium)': mean average precision for medium sized + objects (32^2 pixels < area < 96^2 pixels) + 'Precision/mAP (large)': mean average precision for large objects + (96^2 pixels < area < 10000^2 pixels) + 'Recall/AR@1': average recall with 1 detection + 'Recall/AR@10': average recall with 10 detections + 'Recall/AR@100': average recall with 100 detections + 'Recall/AR@100 (small)': average recall for small objects with 100 + detections + 'Recall/AR@100 (medium)': average recall for medium objects with 100 + detections + 'Recall/AR@100 (large)': average recall for large objects with 100 + detections + 2. per_category_ap: a dictionary holding category specific results with + keys of the form: 'Precision mAP ByCategory/category' + (without the supercategory part if no supercategories exist). + For backward compatibility 'PerformanceByCategory' is included in the + output regardless of all_metrics_per_category. + If evaluating class-agnostic mode, per_category_ap is an empty + dictionary. + + Raises: + ValueError: If category_stats does not exist. + """ + self.evaluate() + self.accumulate() + self.summarize() + + summary_metrics = OrderedDict([ + ('Precision/mAP', self.stats[0]), + ('Precision/mAP@.50IOU', self.stats[1]), + ('Precision/mAP@.75IOU', self.stats[2]), + ('Precision/mAP (small)', self.stats[3]), + ('Precision/mAP (medium)', self.stats[4]), + ('Precision/mAP (large)', self.stats[5]), + ('Recall/AR@1', self.stats[6]), + ('Recall/AR@10', self.stats[7]), + ('Recall/AR@100', self.stats[8]), + ('Recall/AR@100 (small)', self.stats[9]), + ('Recall/AR@100 (medium)', self.stats[10]), + ('Recall/AR@100 (large)', self.stats[11]) + ]) + if not include_metrics_per_category: + return summary_metrics, {} + if not hasattr(self, 'category_stats'): + raise ValueError('Category stats do not exist') + per_category_ap = OrderedDict([]) + if self.GetAgnosticMode(): + return summary_metrics, per_category_ap + for category_index, category_id in enumerate(self.GetCategoryIdList()): + category = self.GetCategory(category_id)['name'] + # Kept for backward compatilbility + per_category_ap['PerformanceByCategory/mAP/{}'.format( + category)] = self.category_stats[0][category_index] + if all_metrics_per_category: + per_category_ap['Precision mAP ByCategory/{}'.format( + category)] = self.category_stats[0][category_index] + per_category_ap['Precision mAP@.50IOU ByCategory/{}'.format( + category)] = self.category_stats[1][category_index] + per_category_ap['Precision mAP@.75IOU ByCategory/{}'.format( + category)] = self.category_stats[2][category_index] + per_category_ap['Precision mAP (small) ByCategory/{}'.format( + category)] = self.category_stats[3][category_index] + per_category_ap['Precision mAP (medium) ByCategory/{}'.format( + category)] = self.category_stats[4][category_index] + per_category_ap['Precision mAP (large) ByCategory/{}'.format( + category)] = self.category_stats[5][category_index] + per_category_ap['Recall AR@1 ByCategory/{}'.format( + category)] = self.category_stats[6][category_index] + per_category_ap['Recall AR@10 ByCategory/{}'.format( + category)] = self.category_stats[7][category_index] + 
per_category_ap['Recall AR@100 ByCategory/{}'.format( + category)] = self.category_stats[8][category_index] + per_category_ap['Recall AR@100 (small) ByCategory/{}'.format( + category)] = self.category_stats[9][category_index] + per_category_ap['Recall AR@100 (medium) ByCategory/{}'.format( + category)] = self.category_stats[10][category_index] + per_category_ap['Recall AR@100 (large) ByCategory/{}'.format( + category)] = self.category_stats[11][category_index] + + return summary_metrics, per_category_ap + + +def _ConvertBoxToCOCOFormat(box): + """Converts a box in [ymin, xmin, ymax, xmax] format to COCO format. + + This is a utility function for converting from our internal + [ymin, xmin, ymax, xmax] convention to the convention used by the COCO API + i.e., [xmin, ymin, width, height]. + + Args: + box: a [ymin, xmin, ymax, xmax] numpy array + + Returns: + a list of floats representing [xmin, ymin, width, height] + """ + return [float(box[1]), float(box[0]), float(box[3] - box[1]), + float(box[2] - box[0])] + + +def _RleCompress(masks): + """Compresses mask using Run-length encoding provided by pycocotools. + + Args: + masks: uint8 numpy array of shape [mask_height, mask_width] with values in + {0, 1}. + + Returns: + A pycocotools Run-length encoding of the mask. + """ + return mask.encode(np.asfortranarray(masks)) + + +def ExportSingleImageGroundtruthToCoco(image_id, + next_annotation_id, + category_id_set, + groundtruth_boxes, + groundtruth_classes, + groundtruth_masks=None, + groundtruth_is_crowd=None): + """Export groundtruth of a single image to COCO format. + + This function converts groundtruth detection annotations represented as numpy + arrays to dictionaries that can be ingested by the COCO evaluation API. Note + that the image_ids provided here must match the ones given to + ExportSingleImageDetectionsToCoco. We assume that boxes and classes are in + correspondence - that is: groundtruth_boxes[i, :], and + groundtruth_classes[i] are associated with the same groundtruth annotation. + + In the exported result, "area" fields are always set to the area of the + groundtruth bounding box. + + Args: + image_id: a unique image identifier either of type integer or string. + next_annotation_id: integer specifying the first id to use for the + groundtruth annotations. All annotations are assigned a continuous integer + id starting from this value. + category_id_set: A set of valid class ids. Groundtruth with classes not in + category_id_set are dropped. + groundtruth_boxes: numpy array (float32) with shape [num_gt_boxes, 4] + groundtruth_classes: numpy array (int) with shape [num_gt_boxes] + groundtruth_masks: optional uint8 numpy array of shape [num_detections, + image_height, image_width] containing detection_masks. + groundtruth_is_crowd: optional numpy array (int) with shape [num_gt_boxes] + indicating whether groundtruth boxes are crowd. + + Returns: + a list of groundtruth annotations for a single image in the COCO format. 
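A one-line check of the `_ConvertBoxToCOCOFormat` helper above, using an arbitrary box purely for illustration:

```python
# [ymin, xmin, ymax, xmax] -> COCO-style [xmin, ymin, width, height]
box = [10.0, 20.0, 110.0, 220.0]
coco_box = [box[1], box[0], box[3] - box[1], box[2] - box[0]]
print(coco_box)  # [20.0, 10.0, 200.0, 100.0]
```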
+ + Raises: + ValueError: if (1) groundtruth_boxes and groundtruth_classes do not have the + right lengths or (2) if each of the elements inside these lists do not + have the correct shapes or (3) if image_ids are not integers + """ + + if len(groundtruth_classes.shape) != 1: + raise ValueError('groundtruth_classes is ' + 'expected to be of rank 1.') + if len(groundtruth_boxes.shape) != 2: + raise ValueError('groundtruth_boxes is expected to be of ' + 'rank 2.') + if groundtruth_boxes.shape[1] != 4: + raise ValueError('groundtruth_boxes should have ' + 'shape[1] == 4.') + num_boxes = groundtruth_classes.shape[0] + if num_boxes != groundtruth_boxes.shape[0]: + raise ValueError('Corresponding entries in groundtruth_classes, ' + 'and groundtruth_boxes should have ' + 'compatible shapes (i.e., agree on the 0th dimension).' + 'Classes shape: %d. Boxes shape: %d. Image ID: %s' % ( + groundtruth_classes.shape[0], + groundtruth_boxes.shape[0], image_id)) + has_is_crowd = groundtruth_is_crowd is not None + if has_is_crowd and len(groundtruth_is_crowd.shape) != 1: + raise ValueError('groundtruth_is_crowd is expected to be of rank 1.') + groundtruth_list = [] + for i in range(num_boxes): + if groundtruth_classes[i] in category_id_set: + iscrowd = groundtruth_is_crowd[i] if has_is_crowd else 0 + export_dict = { + 'id': + next_annotation_id + i, + 'image_id': + image_id, + 'category_id': + int(groundtruth_classes[i]), + 'bbox': + list(_ConvertBoxToCOCOFormat(groundtruth_boxes[i, :])), + 'area': + float((groundtruth_boxes[i, 2] - groundtruth_boxes[i, 0]) * + (groundtruth_boxes[i, 3] - groundtruth_boxes[i, 1])), + 'iscrowd': + iscrowd + } + if groundtruth_masks is not None: + export_dict['segmentation'] = _RleCompress(groundtruth_masks[i]) + groundtruth_list.append(export_dict) + return groundtruth_list + + +def ExportSingleImageDetectionBoxesToCoco(image_id, + category_id_set, + detection_boxes, + detection_scores, + detection_classes): + """Export detections of a single image to COCO format. + + This function converts detections represented as numpy arrays to dictionaries + that can be ingested by the COCO evaluation API. Note that the image_ids + provided here must match the ones given to the + ExporSingleImageDetectionBoxesToCoco. We assume that boxes, and classes are in + correspondence - that is: boxes[i, :], and classes[i] + are associated with the same groundtruth annotation. + + Args: + image_id: unique image identifier either of type integer or string. + category_id_set: A set of valid class ids. Detections with classes not in + category_id_set are dropped. + detection_boxes: float numpy array of shape [num_detections, 4] containing + detection boxes. + detection_scores: float numpy array of shape [num_detections] containing + scored for the detection boxes. + detection_classes: integer numpy array of shape [num_detections] containing + the classes for detection boxes. + + Returns: + a list of detection annotations for a single image in the COCO format. + + Raises: + ValueError: if (1) detection_boxes, detection_scores and detection_classes + do not have the right lengths or (2) if each of the elements inside these + lists do not have the correct shapes or (3) if image_ids are not integers. 
+ """ + + if len(detection_classes.shape) != 1 or len(detection_scores.shape) != 1: + raise ValueError('All entries in detection_classes and detection_scores' + 'expected to be of rank 1.') + if len(detection_boxes.shape) != 2: + raise ValueError('All entries in detection_boxes expected to be of ' + 'rank 2.') + if detection_boxes.shape[1] != 4: + raise ValueError('All entries in detection_boxes should have ' + 'shape[1] == 4.') + num_boxes = detection_classes.shape[0] + if not num_boxes == detection_boxes.shape[0] == detection_scores.shape[0]: + raise ValueError('Corresponding entries in detection_classes, ' + 'detection_scores and detection_boxes should have ' + 'compatible shapes (i.e., agree on the 0th dimension). ' + 'Classes shape: %d. Boxes shape: %d. ' + 'Scores shape: %d' % ( + detection_classes.shape[0], detection_boxes.shape[0], + detection_scores.shape[0] + )) + detections_list = [] + for i in range(num_boxes): + if detection_classes[i] in category_id_set: + detections_list.append({ + 'image_id': image_id, + 'category_id': int(detection_classes[i]), + 'bbox': list(_ConvertBoxToCOCOFormat(detection_boxes[i, :])), + 'score': float(detection_scores[i]) + }) + return detections_list + + +def ExportSingleImageDetectionMasksToCoco(image_id, + category_id_set, + detection_masks, + detection_scores, + detection_classes): + """Export detection masks of a single image to COCO format. + + This function converts detections represented as numpy arrays to dictionaries + that can be ingested by the COCO evaluation API. We assume that + detection_masks, detection_scores, and detection_classes are in correspondence + - that is: detection_masks[i, :], detection_classes[i] and detection_scores[i] + are associated with the same annotation. + + Args: + image_id: unique image identifier either of type integer or string. + category_id_set: A set of valid class ids. Detections with classes not in + category_id_set are dropped. + detection_masks: uint8 numpy array of shape [num_detections, image_height, + image_width] containing detection_masks. + detection_scores: float numpy array of shape [num_detections] containing + scores for detection masks. + detection_classes: integer numpy array of shape [num_detections] containing + the classes for detection masks. + + Returns: + a list of detection mask annotations for a single image in the COCO format. + + Raises: + ValueError: if (1) detection_masks, detection_scores and detection_classes + do not have the right lengths or (2) if each of the elements inside these + lists do not have the correct shapes or (3) if image_ids are not integers. + """ + + if len(detection_classes.shape) != 1 or len(detection_scores.shape) != 1: + raise ValueError('All entries in detection_classes and detection_scores' + 'expected to be of rank 1.') + num_boxes = detection_classes.shape[0] + if not num_boxes == len(detection_masks) == detection_scores.shape[0]: + raise ValueError('Corresponding entries in detection_classes, ' + 'detection_scores and detection_masks should have ' + 'compatible lengths and shapes ' + 'Classes length: %d. Masks length: %d. 
' + 'Scores length: %d' % ( + detection_classes.shape[0], len(detection_masks), + detection_scores.shape[0] + )) + detections_list = [] + for i in range(num_boxes): + if detection_classes[i] in category_id_set: + detections_list.append({ + 'image_id': image_id, + 'category_id': int(detection_classes[i]), + 'segmentation': _RleCompress(detection_masks[i]), + 'score': float(detection_scores[i]) + }) + return detections_list diff --git a/models/object_detection/tensorflow/ssd-mobilenet/inference/int8/infer_detections.py b/models/object_detection/tensorflow/ssd-mobilenet/inference/int8/infer_detections.py index 506ca66d7..a976e3b13 100644 --- a/models/object_detection/tensorflow/ssd-mobilenet/inference/int8/infer_detections.py +++ b/models/object_detection/tensorflow/ssd-mobilenet/inference/int8/infer_detections.py @@ -18,15 +18,18 @@ # from __future__ import division - +import sys import tensorflow as tf from tensorflow.python.data.experimental import parallel_interleave from tensorflow.python.data.experimental import map_and_batch +from tensorflow.python.framework import dtypes import time - +from tensorflow.python.client import timeline from argparse import ArgumentParser -from inference.coco_detection_evaluator import CocoDetectionEvaluator -from inference.coco_label_map import category_map +from coco_detection_evaluator import CocoDetectionEvaluator +from coco_label_map import category_map + +from optimize_for_benchmark import optimize_for_benchmark IMAGE_SIZE = 300 COCO_NUM_VAL_IMAGES = 4952 @@ -118,6 +121,9 @@ def __init__(self): arg_parser.add_argument('-w', "--warmup_iter", help='For accuracy measurement only.', dest='warmup_iter', default=200, type=int) + arg_parser.add_argument("--benchmark", + help='Run in benchmark mode.', + dest='benchmark', action='store_true') # parse the arguments self.args = arg_parser.parse_args() @@ -135,8 +141,9 @@ def __init__(self): input_layer = 'Preprocessor/subpart2' output_layers = ['num_detections', 'detection_boxes', 'detection_scores', 'detection_classes'] self.input_tensor = self.infer_graph.get_tensor_by_name(input_layer + ":0") - self.output_tensors = [self.infer_graph.get_tensor_by_name(x + ":0") for x in output_layers] - + if not self.args.benchmark: + self.output_tensors = [self.infer_graph.get_tensor_by_name(x + ":0") for x in output_layers] + self.category_map_reverse = {v : k for k, v in category_map.items()} def build_data_sess(self): @@ -149,7 +156,7 @@ def build_data_sess(self): preprocess_graph = tf.Graph() with preprocess_graph.as_default(): graph_def = tf.compat.v1.GraphDef() - with tf.compat.v1.gfile.FastGFile(os.path.join(os.path.dirname(dir_path), 'ssdmobilenet_preprocess.pb'), 'rb') as input_file: + with tf.compat.v1.gfile.FastGFile(os.path.join(dir_path, 'ssdmobilenet_preprocess.pb'), 'rb') as input_file: input_graph_content = input_file.read() graph_def.ParseFromString(input_graph_content) @@ -170,7 +177,15 @@ def load_graph(self): input_graph_content = input_file.read() graph_def.ParseFromString(input_graph_content) - tf.import_graph_def(graph_def, name='') + if self.args.benchmark: + input_shape = [self.args.batch_size, IMAGE_SIZE, IMAGE_SIZE, 3] + dummy_input = np.random.normal(0, 1, input_shape) + graph_def = optimize_for_benchmark(graph_def, dtypes.float32.as_datatype_enum, dummy_input) + tf.import_graph_def(graph_def, name='') + output_layers = ['Postprocessor/Reshape_2', 'Postprocessor/convert_scores'] + self.output_tensors = [tf.reshape(self.infer_graph.get_tensor_by_name(x + ":0"), [-1, 1])[0, :] for x in 
output_layers] + else: + tf.import_graph_def(graph_def, name='') def run_benchmark(self): if self.args.data_location: @@ -195,16 +210,19 @@ def run_benchmark(self): print('total iteration is {0}'.format(str(total_iter))) print('warm up iteration is {0}'.format(str(warmup_iter))) - for step in range(total_iter): start_time = time.time() if self.args.data_location: input_images = self.data_sess.run([self.input_images]) input_images = input_images[0] input_images = self.pre_sess.run(self.pre_output, {self.pre_input: input_images}) - _ = sess.run(self.output_tensors, {self.input_tensor: input_images}) - end_time = time.time() + if self.args.benchmark: + _ = sess.run(self.output_tensors) + else: + _ = sess.run(self.output_tensors, {self.input_tensor: input_images}) + + end_time = time.time() duration = end_time - start_time if (step + 1) % 10 == 0: print('steps = {0}, {1} sec'.format(str(step), str(duration))) @@ -275,8 +293,6 @@ def run(self): self.run_benchmark() - if __name__ == "__main__": infer = model_infer() infer.run() - diff --git a/models/object_detection/tensorflow/ssd-mobilenet/inference/int8/optimize_for_benchmark.py b/models/object_detection/tensorflow/ssd-mobilenet/inference/int8/optimize_for_benchmark.py new file mode 100644 index 000000000..ff65baaa8 --- /dev/null +++ b/models/object_detection/tensorflow/ssd-mobilenet/inference/int8/optimize_for_benchmark.py @@ -0,0 +1,72 @@ +# +# -*- coding: utf-8 -*- +# +# Copyright (c) 2022 Intel Corporation +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# + +# Copyright 2017 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
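For context on the benchmark-mode change in `infer_detections.py` above: `optimize_for_benchmark` (defined in this new module) swaps the graph's `Placeholder` for a `Const` carrying a random input, so `sess.run` needs no `feed_dict`. A rough standalone check of that rewrite, using a throwaway identity graph as a stand-in for the real SSD-MobileNet graph (the import path is an assumption):

```python
# Sketch: verify the Placeholder -> Const rewrite on a tiny graph (not the real model).
import numpy as np
import tensorflow as tf
from tensorflow.python.framework import dtypes

from optimize_for_benchmark import optimize_for_benchmark  # assumes this file is importable

g = tf.Graph()
with g.as_default():
    x = tf.compat.v1.placeholder(tf.float32, shape=[1, 300, 300, 3], name='input')
    tf.identity(x, name='output')

dummy_input = np.random.normal(0, 1, [1, 300, 300, 3]).astype(np.float32)
frozen_def = optimize_for_benchmark(g.as_graph_def(),
                                    dtypes.float32.as_datatype_enum,
                                    dummy_input)
print({n.name: n.op for n in frozen_def.node})  # 'input' should now be a Const
```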
+# ==============================================================================
+
+from tensorflow.core.framework import graph_pb2
+from tensorflow.core.framework import node_def_pb2
+from tensorflow.python.framework import tensor_util
+from tensorflow.python.tools.optimize_for_inference_lib import ensure_graph_is_valid
+
+import numpy as np
+
+def optimize_for_benchmark(input_graph_def, const_dtype, dummy_input):
+    ensure_graph_is_valid(input_graph_def)
+    optimized_graph_def = change_placeholder_to_const(input_graph_def, const_dtype, dummy_input)
+    ensure_graph_is_valid(optimized_graph_def)
+    return optimized_graph_def
+
+def change_placeholder_to_const(input_graph_def, const_dtype, dummy_input):
+    result_graph_def = graph_pb2.GraphDef()
+    for node in input_graph_def.node:
+        if node.op == 'Placeholder':
+            new_const = node_def_pb2.NodeDef()
+            new_const.op = 'Const'
+            new_const.name = node.name
+            new_const.attr["dtype"].CopyFrom(node.attr["dtype"])
+            tensor_proto = tensor_util.make_tensor_proto(dummy_input,
+                                                         const_dtype,
+                                                         dummy_input.shape)
+            new_const.attr["value"].tensor.CopyFrom(tensor_proto)
+            result_graph_def.node.extend([new_const])
+        else:
+            new_node = node_def_pb2.NodeDef()
+            new_node.CopyFrom(node)
+            retained_input = []
+            for input_node in new_node.input:
+                retained_input.append(input_node)
+            new_node.input[:] = retained_input
+
+            result_graph_def.node.extend([new_node])
+
+    return result_graph_def
diff --git a/models/object_detection/tensorflow/ssd-mobilenet/inference/int8/ssdmobilenet_preprocess.pb b/models/object_detection/tensorflow/ssd-mobilenet/inference/int8/ssdmobilenet_preprocess.pb
new file mode 100644
index 000000000..e8b51b171
Binary files /dev/null and b/models/object_detection/tensorflow/ssd-mobilenet/inference/int8/ssdmobilenet_preprocess.pb differ
diff --git a/quickstart/common/pytorch/gpu/setvars.sh b/quickstart/common/pytorch/gpu/setvars.sh
new file mode 100644
index 000000000..6611d0861
--- /dev/null
+++ b/quickstart/common/pytorch/gpu/setvars.sh
@@ -0,0 +1,54 @@
+#
+# -*- coding: utf-8 -*-
+#
+# Copyright (c) 2022 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+if [ -z "${ONEAPI_ROOT}" ]; then
+  echo "The ONEAPI_ROOT environment variable was not found. Please source the setvars.sh file before running this script."
+ echo "For example: source /opt/intel/oneapi/setvars.sh" + exit 1 +fi + +# intel dpcpp compiler +source ${ONEAPI_ROOT}/compiler/latest/env/vars.sh + +# for oneMKL build specifically +export MKL_DPCPP_ROOT=${ONEAPI_ROOT}/mkl/latest # or version +export LD_LIBRARY_PATH=${MKL_DPCPP_ROOT}/lib:${MKL_DPCPP_ROOT}/lib64:${MKL_DPCPP_ROOT}/lib/intel64:${LD_LIBRARY_PATH} +export LIBRARY_PATH=${MKL_DPCPP_ROOT}/lib:${MKL_DPCPP_ROOT}/lib64:${MKL_DPCPP_ROOT}/lib/intel64:$LIBRARY_PATH + +# Get the display info from lspci +lspci_display_info=$(lspci | grep -i display) + +if [[ ${lspci_display_info} == *"Intel Corporation Device 0bd5"* ]]; then + export GPU_TYPE="PVC" + + # for AOT + export USE_AOT_DEVLIST='pvc' + + # HW L2 WA + export ForceStatelessL1CachingPolicy=1 +elif [[ ${lspci_display_info} == *"Intel Corporation Device 020a"* ]]; then + export GPU_TYPE="ATS" + + # for AOT + export USE_AOT_DEVLIST='xe_hp_sdv' +else + echo "Unrecognized GPU type: ${lscpi_display_info}" + echo "Expected to find Intel Corporation Device 0bd5 (PVC) or 020a (ATS-P) using the command: lspci | grep -i display" + echo "Please verify the system's configuration and driver setup." +fi + diff --git a/quickstart/image_recognition/pytorch/resnet50v1_5/inference/gpu/DEVCATALOG_FLEX.md b/quickstart/image_recognition/pytorch/resnet50v1_5/inference/gpu/DEVCATALOG_FLEX.md new file mode 100644 index 000000000..bedba3de1 --- /dev/null +++ b/quickstart/image_recognition/pytorch/resnet50v1_5/inference/gpu/DEVCATALOG_FLEX.md @@ -0,0 +1,102 @@ +# Running ResNet50 v1.5 Inference with Int8 on Intel® Data Center GPU Flex Series using Intel® Extension for PyTorch* + + +## Overview + +This document has instructions for running ResNet50v1.5 inference using Intel(R) Extension for PyTorch with GPU. + +## Requirements +| Item | Detail | +| ------ | ------- | +| Host machine | Intel® Data Center GPU Flex Series | +| Drivers | GPU-compatible drivers need to be installed: [Download Driver 476.14](https://dgpu-docs.intel.com/releases/stable_476_14_20221021.html) +| Software | Docker* Installed | + +## Get Started + +## Download Datasets + +The [ImageNet](http://www.image-net.org/) validation dataset is used. + +Download and extract the ImageNet2012 dataset from http://www.image-net.org/, +then move validation images to labeled subfolders, using +[the valprep.sh shell script](https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh) + +A after running the data prep script, your folder structure should look something like this: + +``` +imagenet +└── val + ├── ILSVRC2012_img_val.tar + ├── n01440764 + │ ├── ILSVRC2012_val_00000293.JPEG + │ ├── ILSVRC2012_val_00002138.JPEG + │ ├── ILSVRC2012_val_00003014.JPEG + │ ├── ILSVRC2012_val_00006697.JPEG + │ └── ... + └── ... +``` +The folder that contains the `val` directory should be set as the +`DATASET_DIR` +(for example: `export DATASET_DIR=/home//imagenet`). + +## Quick Start Scripts + +| Script name | Description | +|-------------|-------------| +| `inference_block_format.sh` | Runs ResNet50 inference (block format) for the specified precision (int8) | + +## Run Using Docker + +### Set up Docker Image + +``` +docker pull intel/image-recognition:pytorch-flex-gpu-resnet50v1-5-inference +``` +### Run Docker Image +The ResNet50 v1-5 inference container includes scripts,model and libraries need to run int8 inference. To run the `inference_block_format.sh` quickstart script using this container, you'll need to provide volume mounts for the ImageNet dataset. 
You will need to provide an output directory where log files will be written. + +``` +export PRECISION=int8 +export OUTPUT_DIR= +export DATASET_DIR= +export SCRIPT=quickstart/inference_block_format.sh + +DOCKER_ARGS=${DOCKER_ARGS:---rm -it} +IMAGE_NAME=intel/image-recognition:pytorch-flex-gpu-resnet50v1-5-inference + + +VIDEO=$(getent group video | sed -E 's,^video:[^:]*:([^:]*):.*$,\1,') +RENDER=$(getent group render | sed -E 's,^render:[^:]*:([^:]*):.*$,\1,') + +test -z "$RENDER" || RENDER_GROUP="--group-add ${RENDER}" + +docker run \ + -v :/workspace \ + --group-add ${VIDEO} \ + ${RENDER_GROUP} \ + --device=/dev/dri \ + --ipc=host \ + --env PRECISION=${PRECISION} \ + --env OUTPUT_DIR=${OUTPUT_DIR} \ + --env DATASET_DIR=${DATASET_DIR} \ + --env http_proxy=${http_proxy} \ + --env https_proxy=${https_proxy} \ + --env no_proxy=${no_proxy} \ + --volume ${OUTPUT_DIR}:${OUTPUT_DIR} \ + --volume ${DATASET_DIR}:${DATASET_DIR} \ + ${DOCKER_ARGS} \ + ${IMAGE_NAME} \ + /bin/bash $SCRIPT + ``` + +## Documentation and Sources + +[GitHub* Repository](https://github.com/IntelAI/models/tree/master/dockerfiles/model_containers) + +## Support +Support for Intel® Extension for PyTorch* is found via the [Intel® AI Analytics Toolkit.](https://www.intel.com/content/www/us/en/developer/tools/oneapi/ai-analytics-toolkit.html#gs.qbretz) Additionally, the Intel® Extension for PyTorch* team tracks both bugs and enhancement requests using [GitHub issues](https://github.com/intel/intel-extension-for-pytorch/issues). Before submitting a suggestion or bug report, please search the GitHub issues to see if your issue has already been reported. + +## License Agreement + +LEGAL NOTICE: By accessing, downloading or using this software and any required dependent software (the “Software Package”), you agree to the terms and conditions of the software license agreements for the Software Package, which may also include notices, disclaimers, or license terms for third party software included with the Software Package. Please refer to the [license file](https://github.com/IntelAI/models/tree/master/third_party) for additional details. \ No newline at end of file diff --git a/quickstart/image_recognition/pytorch/resnet50v1_5/inference/gpu/DEVCATALOG_MAX.md b/quickstart/image_recognition/pytorch/resnet50v1_5/inference/gpu/DEVCATALOG_MAX.md new file mode 100644 index 000000000..7e1069f12 --- /dev/null +++ b/quickstart/image_recognition/pytorch/resnet50v1_5/inference/gpu/DEVCATALOG_MAX.md @@ -0,0 +1,82 @@ +# PyTorch ResNet50_v1.5 Inference + +## Description + +This document has instructions for running ResNet50 V1.5 Inference with INT8 precision using Intel Extension for PyTorch on Intel Max Series GPU. + +## Datasets + +The [ImageNet](http://www.image-net.org/) validation dataset is used. + +Download and extract the ImageNet2012 dataset from http://www.image-net.org/, then move validation images to labeled subfolders, using [the valprep.sh shell script](https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh) + +After running the data prep script, your folder structure should look something like this: + +``` +imagenet +└── val + ├── ILSVRC2012_img_val.tar + ├── n01440764 + │ ├── ILSVRC2012_val_00000293.JPEG + │ ├── ILSVRC2012_val_00002138.JPEG + │ ├── ILSVRC2012_val_00003014.JPEG + │ ├── ILSVRC2012_val_00006697.JPEG + │ └── ... + └── ... +``` +The folder that contains the `val` directory should be set as the +`DATASET_DIR` +(for example: `export DATASET_DIR=/home//imagenet`). 
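Before launching the container, it can be worth confirming that the extracted dataset matches the layout shown above; the check below assumes `DATASET_DIR` is exported as described and that the usual 1000 ImageNet-2012 class folders exist after `valprep.sh`.

```python
# Optional sanity check of the ImageNet validation layout (DATASET_DIR assumed to be set).
import os

val_dir = os.path.join(os.environ["DATASET_DIR"], "val")
class_dirs = [d for d in os.listdir(val_dir)
              if d.startswith("n") and os.path.isdir(os.path.join(val_dir, d))]
print(f"Found {len(class_dirs)} class folders under {val_dir}")  # expect 1000 after valprep.sh
```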
+ +## Quick Start Scripts + +| Script name | Description | +|-------------|-------------| +| `inference_block_format.sh` | Runs ResNet50 V1.5 INT8 inference (block format)| + +Requirements: +* Host machine has Intel(R) Data Center Max Series GPU +* Follow instructions to install GPU-compatible driver [540](https://dgpu-docs.intel.com/releases/stable_540_20221205.html#ubuntu-22-04) +* Docker + +### Docker pull command: + +``` +docker pull intel/image-recognition:pytorch-max-gpu-resnet50v1-5-inference +``` +The ResNet50 v1.5 inference container includes scripts,model and libraries need to run INT8 inference. To run the `inference_block_format.sh` quickstart script using this container, you'll need to provide volume mounts for the ImageNet dataset. You will need to provide an output directory where log files will be written. + +``` +export DATASET_DIR=${PWD}/imagenet +export OUTPUT_DIR=${PWD}/logs + +IMAGE_NAME=intel/image-recognition:pytorch-max-gpu-resnet50v1-5-inference +DOCKER_ARGS="--rm -it" + +SCRIPT=quickstart/inference_block_format.sh +Tile=2 + +VIDEO=$(getent group video | sed -E 's,^video:[^:]*:([^:]*):.*$,\1,') +RENDER=$(getent group render | sed -E 's,^render:[^:]*:([^:]*):.*$,\1,') + +test -z "$RENDER" || RENDER_GROUP="--group-add ${RENDER}" + +docker run \ + --group-add ${VIDEO} \ + ${RENDER_GROUP} \ + --device=/dev/dri \ + --ipc=host \ + --env DATASET_DIR=${DATASET_DIR} \ + --env PRECISION=${PRECISION} \ + --env OUTPUT_DIR=${OUTPUT_DIR} \ + --env Tile=${Tile} \ + --env http_proxy=${http_proxy} \ + --env https_proxy=${https_proxy} \ + --env no_proxy=${no_proxy} \ + --volume ${DATASET_DIR}:${DATASET_DIR} \ + --volume ${OUTPUT_DIR}:${OUTPUT_DIR} \ + ${DOCKER_ARGS} \ + $IMAGE_NAME \ + /bin/bash $SCRIPT +``` + diff --git a/quickstart/image_recognition/pytorch/resnet50v1_5/inference/gpu/README_Flex_Series.md b/quickstart/image_recognition/pytorch/resnet50v1_5/inference/gpu/README_Flex_Series.md new file mode 100644 index 000000000..53a03ccb8 --- /dev/null +++ b/quickstart/image_recognition/pytorch/resnet50v1_5/inference/gpu/README_Flex_Series.md @@ -0,0 +1,131 @@ + +# ResNet50v1.5 inference + + +## Description + +This document has instructions for running ResNet50v1.5 inference using +Intel(R) Extension for PyTorch with GPU. + + +## Hardware Requirements: +- Intel® Data Center GPU Flex Series + +## Software Requirements: +- Intel GPU Drivers: Intel® Data Center GPU Flex Series [419.40](https://dgpu-docs.intel.com/releases/stable_419_40_20220914.html) + + |Release|Intel GPU|Install Intel GPU Driver| + |-|-|-| + |v1.0.0|Intel® Data Center GPU Flex Series| Refer to the [Installation Guides](https://dgpu-docs.intel.com/installation-guides/ubuntu/ubuntu-focal-dc.html) for latest driver installation. If install the verified Intel® Data Center GPU Flex Series [419.40](https://dgpu-docs.intel.com/releases/stable_419_40_20220914.html), please append the specific version after components, such as `apt-get install intel-opencl-icd=22.28.23726.1+i419~u20.04`| + +- Intel® oneAPI Base Toolkit 2022.3: Need to install components of Intel® oneAPI Base Toolkit + - Intel® oneAPI DPC++ Compiler + - Intel® oneAPI Math Kernel Library (oneMKL) + * Download and install the verified DPC++ compiler and oneMKL in Ubuntu 20.04. 
+ + ```bash + wget https://registrationcenter-download.intel.com/akdlm/irc_nas/18852/l_BaseKit_p_2022.3.0.8767_offline.sh + # 4 components are necessary: DPC++/C++ Compiler, DPC++ Libiary, Threading Building Blocks and oneMKL + sh ./l_BaseKit_p_2022.3.0.8767_offline.sh + ``` + For any more details, please follow the procedure in https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit.html. + + - Set environment variables: + Default installation location {ONEAPI_ROOT} is /opt/intel/oneapi for root account, ${HOME}/intel/oneapi for other accounts + ```bash + source {ONEAPI_ROOT}/setvars.sh + ``` + + + +## Datasets + +The [ImageNet](http://www.image-net.org/) validation dataset is used. + +Download and extract the ImageNet2012 dataset from http://www.image-net.org/, +then move validation images to labeled subfolders, using +[the valprep.sh shell script](https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh) + +A after running the data prep script, your folder structure should look something like this: + +``` +imagenet +└── val + ├── ILSVRC2012_img_val.tar + ├── n01440764 + │   ├── ILSVRC2012_val_00000293.JPEG + │   ├── ILSVRC2012_val_00002138.JPEG + │   ├── ILSVRC2012_val_00003014.JPEG + │   ├── ILSVRC2012_val_00006697.JPEG + │   └── ... + └── ... +``` +The folder that contains the `val` directory should be set as the +`DATASET_DIR` +(for example: `export DATASET_DIR=/home//imagenet`). + + +## Quick Start Scripts + +| Script name | Description | +|-------------|-------------| +| inference_block_format.sh | Runs ResNet50 inference (block format) for the int8 precision | + + +## Run the model +Install the following pre-requisites: +* Python version 3.9 +* Create and activate virtual environment. + ```bash + virtualenv -p python + source /bin/activate + ``` +* Install PyTorch and Intel® Extension for PyTorch for GPU (IPEX): + ```bash + python -m pip install torch==1.10.0a0 -f https://developer.intel.com/ipex-whl-stable-xpu + python -m pip install numpy==1.23.4 + python -m pip install intel_extension_for_pytorch==1.10.200+gpu -f https://developer.intel.com/ipex-whl-stable-xpu + ``` + To verify that PyTorch and IPEX are correctly installed: + ```bash + python -c "import torch;print(torch.device('xpu'))" # Sample output: "xpu" + python -c "import intel_extension_for_pytorch as ipex;print(ipex.xpu.is_available())" #Sample output True + python -c "import intel_extension_for_pytorch as ipex;print(ipex.xpu.has_onemkl())" # Sample output: True + ``` +* Clone the Model Zoo repository: + ```bash + git clone https://github.com/IntelAI/models.git + ``` +* Navigate to ResNet50v1.5 inference directory and install model specific dependencies for the workload: + ```bash + # Navigate to the model zoo repo + cd models + cd quickstart/image_recognition/pytorch/resnet50v1_5/inference/gpu + ./setup.sh + cd - + ``` + +See the [datasets section](#datasets) of this document for instructions on +downloading and preprocessing the ImageNet dataset. The path to the ImageNet +dataset files will need to be set as the `DATASET_DIR` environment variable +prior to running a [quickstart script](#quick-start-scripts). + +### Run the model on Baremetal +Set environment variables for the path to your dataset, an output directory to run the quickstart script: +``` +To run with ImageNet data, the dataset directory will need to be specified in addition to an output directory and precision. 
+export DATASET_DIR=
+export OUTPUT_DIR=
+
+# Optional envs
+export BATCH_SIZE=
+
+# Run a quickstart script
+./quickstart/image_recognition/pytorch/resnet50v1_5/inference/gpu/inference_block_format.sh
+```
+
+
+## License
+
+[LICENSE](/LICENSE)
+
diff --git a/quickstart/image_recognition/pytorch/resnet50v1_5/inference/gpu/README_Max_Series.md b/quickstart/image_recognition/pytorch/resnet50v1_5/inference/gpu/README_Max_Series.md
new file mode 100644
index 000000000..cb122882b
--- /dev/null
+++ b/quickstart/image_recognition/pytorch/resnet50v1_5/inference/gpu/README_Max_Series.md
@@ -0,0 +1,137 @@
+
+# ResNet50v1.5 inference
+
+
+## Description
+
+This document has instructions for running ResNet50v1.5 inference using
+Intel(R) Extension for PyTorch with GPU.
+
+
+## Hardware Requirements:
+- Intel® Data Center GPU Max Series, Driver Version: [540](https://dgpu-docs.intel.com/releases/stable_540_20221205.html)
+
+## Software Requirements:
+- Intel® Data Center GPU Max Series
+- Intel GPU Drivers: Intel® Data Center GPU Max Series [540](https://dgpu-docs.intel.com/releases/stable_540_20221205.html)
+- Intel® oneAPI Base Toolkit 2023.0
+- Python 3.7-3.10
+- pip 19.0 or later (requires manylinux2014 support)
+
+  |Release|Intel GPU|Install Intel GPU Driver|
+  |-|-|-|
+  |v1.1.0|Intel® Data Center GPU Max Series| Refer to the [Installation Guides](https://dgpu-docs.intel.com/installation-guides/index.html#intel-data-center-gpu-max-series) for the latest driver installation. If you install the verified Intel® Data Center GPU Max Series/Intel® Data Center GPU Flex Series driver [540](https://dgpu-docs.intel.com/releases/stable_540_20221205.html), append the specific version after each component.|
+
+- Intel® oneAPI Base Toolkit 2023.0.0: install the following components of the Intel® oneAPI Base Toolkit
+  - Intel® oneAPI DPC++ Compiler
+  - Intel® oneAPI Threading Building Blocks (oneTBB)
+  - Intel® oneAPI Math Kernel Library (oneMKL)
+  * Download and install the verified DPC++ compiler, oneTBB and oneMKL.
+
+    ```bash
+    $ wget https://registrationcenter-download.intel.com/akdlm/irc_nas/19079/l_BaseKit_p_2023.0.0.25537_offline.sh
+    # 4 components are necessary: DPC++/C++ Compiler, DPC++ Library, oneTBB and oneMKL
+    $ sudo sh ./l_BaseKit_p_2023.0.0.25537_offline.sh
+    ```
+    For more details on how to download and install the Base Kit, please follow the procedure at https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html?operatingsystem=linux&distributions=offline.
+
+  - Set environment variables:
+    Default installation location `{ONEAPI_ROOT}` is `/opt/intel/oneapi` for the root account and `${HOME}/intel/oneapi` for other accounts.
+    ```bash
+    source {ONEAPI_ROOT}/compiler/latest/env/vars.sh
+    source {ONEAPI_ROOT}/mkl/latest/env/vars.sh
+    source {ONEAPI_ROOT}/tbb/latest/env/vars.sh
+    ```
+
+
+
+## Datasets
+
+The [ImageNet](http://www.image-net.org/) validation dataset is used.
+
+Download and extract the ImageNet2012 dataset from http://www.image-net.org/,
+then move validation images to labeled subfolders, using
+[the valprep.sh shell script](https://raw.githubusercontent.com/soumith/imagenetloader.torch/master/valprep.sh)
+
+After running the data prep script, your folder structure should look something like this:
+
+```
+imagenet
+└── val
+    ├── ILSVRC2012_img_val.tar
+    ├── n01440764
+    │   ├── ILSVRC2012_val_00000293.JPEG
+    │   ├── ILSVRC2012_val_00002138.JPEG
+    │   ├── ILSVRC2012_val_00003014.JPEG
+    │   ├── ILSVRC2012_val_00006697.JPEG
+    │   └── ...
+    └── ...
+```
+The folder that contains the `val` directory should be set as the
+`DATASET_DIR`
+(for example: `export DATASET_DIR=/home//imagenet`).
+
+
+## Quick Start Scripts
+
+| Script name | Description |
+|-------------|-------------|
+| inference_block_format.sh | Runs ResNet50 inference (block format) for int8 precision |
+
+
+## Run the model
+Install the following prerequisites:
+* Create and activate a virtual environment.
+  ```bash
+  virtualenv -p python
+  source /bin/activate
+  ```
+* Install PyTorch and Intel® Extension for PyTorch for GPU (IPEX):
+  ```bash
+  python -m pip install torch==1.13.0a0 -f https://developer.intel.com/ipex-whl-stable-xpu
+  python -m pip install intel_extension_for_pytorch==1.13.10+xpu -f https://developer.intel.com/ipex-whl-stable-xpu
+  ```
+  To verify that PyTorch and IPEX are correctly installed:
+  ```bash
+  python -c "import torch;print(torch.device('xpu'))" # Sample output: "xpu"
+  python -c "import intel_extension_for_pytorch as ipex;print(ipex.xpu.is_available())" # Sample output: True
+  python -c "import intel_extension_for_pytorch as ipex;print(ipex.xpu.has_onemkl())" # Sample output: True
+  ```
+* Clone the Model Zoo repository:
+  ```bash
+  git clone https://github.com/IntelAI/models.git
+  ```
+* Navigate to the ResNet50v1.5 inference directory and install model-specific dependencies for the workload:
+  ```bash
+  # Navigate to the model zoo repo
+  cd models
+
+  cd quickstart/image_recognition/pytorch/resnet50v1_5/inference/gpu
+  ./setup.sh
+  cd -
+  ```
+
+See the [datasets section](#datasets) of this document for instructions on
+downloading and preprocessing the ImageNet dataset. The path to the ImageNet
+dataset files will need to be set as the `DATASET_DIR` environment variable
+prior to running a [quickstart script](#quick-start-scripts).
+
+### Run the model on Baremetal
+Set environment variables for the path to your dataset and an output directory, then run the quickstart script:
+```
+# To run with ImageNet data, the dataset directory will need to be specified in addition to an output directory and precision.
+export DATASET_DIR=
+export OUTPUT_DIR=
+export Tile=2
+
+# Optional envs
+export BATCH_SIZE=
+
+# Run a quickstart script
+./quickstart/image_recognition/pytorch/resnet50v1_5/inference/gpu/inference_block_format.sh
+```
+
+
+## License
+
+[LICENSE](/LICENSE)
diff --git a/quickstart/image_recognition/pytorch/resnet50v1_5/inference/gpu/build.sh b/quickstart/image_recognition/pytorch/resnet50v1_5/inference/gpu/build.sh
new file mode 100755
index 000000000..8ce45aaf9
--- /dev/null
+++ b/quickstart/image_recognition/pytorch/resnet50v1_5/inference/gpu/build.sh
@@ -0,0 +1,54 @@
+#!/usr/bin/env bash
+#
+# Copyright (c) 2021 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+set -e
+GPU_TYPE=$1
+
+PYTORCH_BASE_IMAGE=${PYTORCH_BASE_IMAGE:-intel/intel-extension-for-pytorch}
+PYTORCH_BASE_TAG=${PYTORCH_BASE_TAG:-xpu-max}
+
+if [[ $GPU_TYPE == max-series ]]; then
+
+  IMAGE_NAME=${IMAGE_NAME:-intel/image-recognition:pytorch-max-gpu-resnet50v1-5-inference}
+  docker build \
+    --build-arg PACKAGE_DIR=model_packages \
+    --build-arg PACKAGE_NAME=pytorch-max-series-resnet50v1-5-inference \
+    --build-arg MODEL_WORKSPACE=/workspace \
+    --build-arg http_proxy=$http_proxy \
+    --build-arg https_proxy=$https_proxy \
+    --build-arg no_proxy=$no_proxy \
+    --build-arg PYTORCH_BASE_IMAGE=${PYTORCH_BASE_IMAGE} \
+    --build-arg PYTORCH_BASE_TAG=${PYTORCH_BASE_TAG} \
+    -t $IMAGE_NAME \
+    -f pytorch-max-series-resnet50v1-5-inference.Dockerfile .
+elif [[ $GPU_TYPE == flex-series ]]; then
+  IMAGE_NAME=${IMAGE_NAME:-intel/image-recognition:pytorch-flex-gpu-resnet50v1-5-inference}
+  docker build \
+    --build-arg PACKAGE_DIR=model_packages \
+    --build-arg PACKAGE_NAME=pytorch-atsm-resnet50v1-5-inference \
+    --build-arg MODEL_WORKSPACE=/workspace \
+    --build-arg http_proxy=$http_proxy \
+    --build-arg https_proxy=$https_proxy \
+    --build-arg no_proxy=$no_proxy \
+    --build-arg PYTORCH_BASE_IMAGE=${PYTORCH_BASE_IMAGE} \
+    --build-arg PYTORCH_BASE_TAG=${PYTORCH_BASE_TAG} \
+    -t $IMAGE_NAME \
+    -f pytorch-atsm-resnet50v1-5-inference.Dockerfile .
+else
+  echo "Only flex-series or max-series GPU platforms are supported"
+  exit 1
+fi
\ No newline at end of file
diff --git a/quickstart/image_recognition/pytorch/resnet50v1_5/inference/gpu/inference_block_format.sh b/quickstart/image_recognition/pytorch/resnet50v1_5/inference/gpu/inference_block_format.sh
new file mode 100755
index 000000000..d7007d3b5
--- /dev/null
+++ b/quickstart/image_recognition/pytorch/resnet50v1_5/inference/gpu/inference_block_format.sh
@@ -0,0 +1,120 @@
+#!/usr/bin/env bash
+#
+# Copyright (c) 2021 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+MODEL_DIR=${MODEL_DIR-$PWD}
+NUM_ITERATIONS=${NUM_ITERATIONS-10}
+
+
+if [[ -z "${Tile}" ]]; then
+    Tile=${Tile-1}
+else
+    Tile=${Tile}
+fi
+
+if [[ -z "${DATASET_DIR}" ]]; then
+  echo "The required environment variable DATASET_DIR has not been set"
+  exit 1
+fi
+
+if [[ ! -d "${DATASET_DIR}" ]]; then
+  echo "The DATASET_DIR '${DATASET_DIR}' does not exist"
+  exit 1
+fi
+
+if [[ -z $OUTPUT_DIR ]]; then
+  echo "The required environment variable OUTPUT_DIR has not been set"
+  exit 1
+fi
+
+# If the BATCH_SIZE env var is not set, the workload will run with the default batch size.
+if [ -z "${BATCH_SIZE}" ]; then
+  BATCH_SIZE="1024"
+  echo "Running with default batch size of ${BATCH_SIZE}"
+fi
+
+# Create the output directory, if it doesn't already exist
+mkdir -p $OUTPUT_DIR
+
+
+resnet50_log_analysis() {
+    # $1 : src raw log
+    # $2 : dst format log
+    # $3 : inference or training
+    # $4 : bs
+
+    bs=$4
+    if [ -f $2 ]; then
+        rm $2
+    fi
+
+    if [ "inference" == "$3" ]; then
+        echo -e 'Batch Size: ' $bs >$2
+        cat $1 | grep Test | tail -n6 | head -n5 |
+            awk -v bs=${bs} -F ' ' '{a+=$5}END{printf "Performance Benchmark Time: %.3f sec, Throughput: %.2f FPS\n", a/5, bs*5/a}' >>$2
+        cat $1 | tail -n2 | grep "Acc@1" | awk -F ' ' '{printf "Accuracy: acc@1 %.2f\n", $3}' >>$2
+    elif [ "training" == "$3" ]; then
+        echo -e 'Batch Size: ' $bs >$2
+        cat $1 | grep Epoch | tail -n1 | awk -v bs=${bs} -F ' ' '{printf "Performance Benchmark Time: %.3f sec, Throughput: %.2f FPS\n", $5, bs/$4}' >>$2
+    else
+        echo -e 'Invalid input! Only inference or training are supported.'
+        exit 0
+    fi
+}
+
+if [[ ${Tile} == "1" ]]; then
+    # int8 uses a different python script
+    echo "resnet50 int8 inference block"
+    IPEX_XPU_ONEDNN_LAYOUT=1 python -u models/image_recognition/pytorch/resnet50v1_5/inference/gpu/main.py \
+        -a resnet50 \
+        -b ${BATCH_SIZE} \
+        --xpu 0 \
+        -e \
+        --pretrained \
+        --int8 1 \
+        --num-iterations ${NUM_ITERATIONS} \
+        --benchmark 1 \
+        ${DATASET_DIR} 2>&1 | tee ${OUTPUT_DIR}/resnet50_int8_inf_block_t0_raw.log
+    resnet50_log_analysis ${OUTPUT_DIR}/resnet50_int8_inf_block_t0_raw.log ${OUTPUT_DIR}/resnet50_int8_inf_block_t0.log inference ${BATCH_SIZE}
+elif [[ ${Tile} == "2" ]]; then
+    echo "resnet50 int8 inference block two tile"
+    # Pin one inference process to each GPU tile using ZE_AFFINITY_MASK; the tile 0 run goes to the background
+    ZE_AFFINITY_MASK=0.0 IPEX_XPU_ONEDNN_LAYOUT=1 python -u models/image_recognition/pytorch/resnet50v1_5/inference/gpu/main.py \
+        -a resnet50 \
+        -b ${BATCH_SIZE} \
+        --xpu 0 \
+        -e \
+        --pretrained \
+        --int8 1 \
+        --num-iterations ${NUM_ITERATIONS} \
+        --benchmark 1 \
+        ${DATASET_DIR} 2>&1 | tee ${OUTPUT_DIR}/resnet50_int8_inf_block_t0_raw.log &
+    ZE_AFFINITY_MASK=0.1 IPEX_XPU_ONEDNN_LAYOUT=1 python -u models/image_recognition/pytorch/resnet50v1_5/inference/gpu/main.py \
+        -a resnet50 \
+        -b ${BATCH_SIZE} \
+        --xpu 0 \
+        -e \
+        --pretrained \
+        --int8 1 \
+        --num-iterations ${NUM_ITERATIONS} \
+        --benchmark 1 \
+        ${DATASET_DIR} 2>&1 | tee ${OUTPUT_DIR}/resnet50_int8_inf_block_t1_raw.log
+    wait  # make sure the backgrounded tile 0 run has finished before parsing its log
+    resnet50_log_analysis ${OUTPUT_DIR}/resnet50_int8_inf_block_t0_raw.log ${OUTPUT_DIR}/resnet50_int8_inf_block_t0.log inference ${BATCH_SIZE}
+    resnet50_log_analysis ${OUTPUT_DIR}/resnet50_int8_inf_block_t1_raw.log ${OUTPUT_DIR}/resnet50_int8_inf_block_t1.log inference ${BATCH_SIZE}
+else
+    echo "The specified Tile '${Tile}' is unsupported."
+    echo "Supported tile numbers are: 1 and 2"
+    exit 1
+fi
diff --git a/quickstart/image_recognition/pytorch/resnet50v1_5/inference/gpu/setup.sh b/quickstart/image_recognition/pytorch/resnet50v1_5/inference/gpu/setup.sh
new file mode 100755
index 000000000..c8bcb85ad
--- /dev/null
+++ b/quickstart/image_recognition/pytorch/resnet50v1_5/inference/gpu/setup.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+#
+# Copyright (c) 2022 Intel Corporation
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+pip install pillow
+pip install torchvision==0.14.0 --no-deps
diff --git a/quickstart/image_recognition/pytorch/resnet50v1_5/training/gpu/.docs/aikit.md b/quickstart/image_recognition/pytorch/resnet50v1_5/training/gpu/.docs/aikit.md
new file mode 100644
index 000000000..e1aae4812
--- /dev/null
+++ b/quickstart/image_recognition/pytorch/resnet50v1_5/training/gpu/.docs/aikit.md
@@ -0,0 +1,46 @@
+
+## Run the model
+
+Requirements:
+* Host machine has an Intel GPU.
+* Host machine has a Linux kernel installed that is compatible with the GPU drivers.
+* `lspci` (installed using `pciutils` from apt or yum)
+* Source the oneAPI AI Kit `setvars.sh` file (once per session)
+  ```
+  source /opt/intel/oneapi/setvars.sh
+  ```
+* Set up a conda environment with the dependencies needed to run ResNet50v1.5. Clone
+  the pytorch conda environment from AI Kit before running the setup script.
+  ```
+  # Create a clone of the AI Kit pytorch conda environment before running the setup.sh script
+  conda create --clone pytorch --name ${USER}-pytorch-resnet50
+  conda activate ${USER}-pytorch-resnet50
+
+  # Navigate to the directory, and run the setup.sh script from the quickstart folder
+  cd 
+  quickstart/setup.sh
+  ```
+  Note that the same conda environment can be used for both training and inference.
+
+See the [datasets section](#datasets) of this document for instructions on
+downloading and extracting the ImageNet dataset.
+
+This snippet shows how to run a quickstart script using AI Kit. Before running,
+you'll need to make sure that you have all the requirements listed above,
+including the conda environment activated. Set environment variables for the path to
+your dataset and an output directory, and specify the precision to run.
+```
+# Navigate to the directory
+cd 
+
+# Activate the PyTorch conda environment
+conda activate ${USER}-pytorch-resnet50
+
+# Set environment vars for the dataset and an output directory
+export DATASET_DIR=
+export OUTPUT_DIR=
+export PRECISION=
+
+# Run a quickstart script
+quickstart/