Commit

Add test configurations for deterministic outputs on Dataflow (apache#24325)

* Add test configurations for deterministic outputs on Dataflow

* Fix groovy syntax

* Add machine config type to README

* Fixup names and units for RunInference Benchmark tests

* Change machine type to n1-standard-2
AnandInguva authored Nov 23, 2022
1 parent d2211bd commit 80311d0
Showing 3 changed files with 49 additions and 19 deletions.
30 changes: 20 additions & 10 deletions .test-infra/jenkins/job_InferenceBenchmarkTests_Python.groovy
@@ -35,10 +35,13 @@ def loadTestConfigurations = {
job_name : 'benchmark-tests-pytorch-imagenet-python' + now,
project : 'apache-beam-testing',
region : 'us-central1',
machine_type : 'n1-standard-2',
num_workers : 75,
disk_size_gb : 50,
autoscaling_algorithm : 'NONE',
staging_location : 'gs://temp-storage-for-perf-tests/loadtests',
temp_location : 'gs://temp-storage-for-perf-tests/loadtests',
requirements_file : 'apache_beam/ml/inference/torch_tests_requirements.txt',
experiments : 'no_use_multiple_sdk_containers',
publish_to_big_query : true,
metrics_dataset : 'beam_run_inference',
metrics_table : 'torch_inference_imagenet_results_resnet101',
@@ -47,7 +50,7 @@ def loadTestConfigurations = {
influx_db_name : InfluxDBCredentialsHelper.InfluxDBDatabaseName,
influx_hostname : InfluxDBCredentialsHelper.InfluxDBHostUrl,
pretrained_model_name : 'resnet101',
input_file                    : 'gs://apache-beam-ml/testing/inputs/openimage_50k_benchmark.txt',
model_state_dict_path : 'gs://apache-beam-ml/models/torchvision.models.resnet101.pth',
output : 'gs://temp-storage-for-end-to-end-tests/torch/result_101' + now + '.txt'
]
@@ -60,10 +63,13 @@ def loadTestConfigurations = {
job_name : 'benchmark-tests-pytorch-imagenet-python' + now,
project : 'apache-beam-testing',
region : 'us-central1',
machine_type : 'n1-standard-2',
num_workers : 75,
disk_size_gb : 50,
autoscaling_algorithm : 'NONE',
staging_location : 'gs://temp-storage-for-perf-tests/loadtests',
temp_location : 'gs://temp-storage-for-perf-tests/loadtests',
requirements_file : 'apache_beam/ml/inference/torch_tests_requirements.txt',
experiments : 'no_use_multiple_sdk_containers',
publish_to_big_query : true,
metrics_dataset : 'beam_run_inference',
metrics_table : 'torch_inference_imagenet_results_resnet152',
Expand All @@ -72,7 +78,7 @@ def loadTestConfigurations = {
influx_db_name : InfluxDBCredentialsHelper.InfluxDBDatabaseName,
influx_hostname : InfluxDBCredentialsHelper.InfluxDBHostUrl,
pretrained_model_name : 'resnet152',
input_file                    : 'gs://apache-beam-ml/testing/inputs/openimage_50k_benchmark.txt',
model_state_dict_path : 'gs://apache-beam-ml/models/torchvision.models.resnet152.pth',
output : 'gs://temp-storage-for-end-to-end-tests/torch/result_resnet152' + now + '.txt'
]
@@ -86,19 +92,21 @@ def loadTestConfigurations = {
job_name : 'benchmark-tests-pytorch-language-modeling-bert-base-uncased' + now,
project : 'apache-beam-testing',
region : 'us-central1',
machine_type : 'n1-standard-2',
num_workers : 250,
disk_size_gb : 50,
autoscaling_algorithm : 'NONE',
staging_location : 'gs://temp-storage-for-perf-tests/loadtests',
temp_location : 'gs://temp-storage-for-perf-tests/loadtests',
requirements_file : 'apache_beam/ml/inference/torch_tests_requirements.txt',
pickle_library : 'cloudpickle',
experiments : 'no_use_multiple_sdk_containers',
publish_to_big_query : true,
metrics_dataset : 'beam_run_inference',
metrics_table : 'torch_language_modeling_bert_base_uncased',
input_options : '{}', // this option is not required for RunInference tests.
influx_measurement : 'torch_language_modeling_bert_base_uncased',
influx_db_name : InfluxDBCredentialsHelper.InfluxDBDatabaseName,
influx_hostname : InfluxDBCredentialsHelper.InfluxDBHostUrl,
input_file              : 'gs://apache-beam-ml/testing/inputs/sentences_50k.txt',
bert_tokenizer : 'bert-base-uncased',
model_state_dict_path : 'gs://apache-beam-ml/models/huggingface.BertForMaskedLM.bert-base-uncased.pth',
output : 'gs://temp-storage-for-end-to-end-tests/torch/result_bert_base_uncased' + now + '.txt',
@@ -112,19 +120,21 @@ def loadTestConfigurations = {
job_name : 'benchmark-tests-pytorch-language-modeling-bert-large-cased' + now,
project : 'apache-beam-testing',
region : 'us-central1',
machine_type : 'n1-standard-2',
num_workers : 250,
disk_size_gb : 50,
autoscaling_algorithm : 'NONE',
staging_location : 'gs://temp-storage-for-perf-tests/loadtests',
temp_location : 'gs://temp-storage-for-perf-tests/loadtests',
requirements_file : 'apache_beam/ml/inference/torch_tests_requirements.txt',
pickle_library : 'cloudpickle',
experiments : 'no_use_multiple_sdk_containers',
publish_to_big_query : true,
metrics_dataset : 'beam_run_inference',
metrics_table : 'torch_language_modeling_bert_large_uncased',
input_options : '{}', // this option is not required for RunInference tests.
influx_measurement : 'torch_language_modeling_bert_large_uncased',
influx_db_name : InfluxDBCredentialsHelper.InfluxDBDatabaseName,
influx_hostname : InfluxDBCredentialsHelper.InfluxDBHostUrl,
input_file              : 'gs://apache-beam-ml/testing/inputs/sentences_50k.txt',
bert_tokenizer : 'bert-large-uncased',
model_state_dict_path : 'gs://apache-beam-ml/models/huggingface.BertForMaskedLM.bert-large-uncased.pth',
output : 'gs://temp-storage-for-end-to-end-tests/torch/result_bert_large_uncased' + now + '.txt'
@@ -424,7 +424,7 @@
"steppedLine": false,
"targets": [
{
"alias": "mean_load_model_latency_milli_seconds_resnet101",
"alias": "mean_inference_batch_latency_resnet101",
"groupBy": [
{
"params": [
@@ -462,7 +462,7 @@
"tags": []
},
{
"alias": "mean_load_model_latency_milli_seconds_resnet_152",
"alias": "mean_inference_batch_latency_resnet_152",
"groupBy": [
{
"params": [
@@ -593,7 +593,7 @@
"steppedLine": false,
"targets": [
{
"alias": "$mean_inference_batch_latency_bert_base_uncased",
"alias": "mean_inference_batch_latency_bert_base_uncased",
"groupBy": [
{
"params": [
@@ -762,7 +762,7 @@
"steppedLine": false,
"targets": [
{
"alias": "mean_load_model_latency_milli_seconds_resnet101",
"alias": "mean_load_model_latency_resnet101",
"groupBy": [
{
"params": [
@@ -800,7 +800,7 @@
"tags": []
},
{
"alias": "mean_load_model_latency_milli_seconds_resnet_152",
"alias": "mean_load_model_latency_resnet_152",
"groupBy": [
{
"params": [
@@ -931,7 +931,7 @@
"steppedLine": false,
"targets": [
{
"alias": "mean_load_model_latency_milli_seconds_bert_base_uncased",
"alias": "mean_load_model_latency_bert_base_uncased",
"groupBy": [
{
"params": [
@@ -969,7 +969,7 @@
"tags": []
},
{
"alias": "mean_load_model_latency_milli_seconds_bert_large_uncased",
"alias": "mean_load_model_latency_bert_large_uncased",
"groupBy": [
{
"params": [
24 changes: 22 additions & 2 deletions sdks/python/apache_beam/testing/benchmarks/inference/README.md
@@ -29,7 +29,7 @@ The Pytorch RunInference Image Classification 50K benchmark runs an
[example image classification pipeline](https://github.com/apache/beam/blob/master/sdks/python/apache_beam/examples/inference/pytorch_image_classification.py)
using various ResNet image classification models (the benchmarks on
[Beam's dashboard](http://s.apache.org/beam-community-metrics/d/ZpS8Uf44z/python-ml-runinference-benchmarks?orgId=1)
display [resnet101](https://huggingface.co/microsoft/resnet-101) and [resnet152](https://huggingface.co/microsoft/resnet-152))
display [resnet101](https://pytorch.org/vision/main/models/generated/torchvision.models.resnet101.html) and [resnet152](https://pytorch.org/vision/stable/models/generated/torchvision.models.resnet152.html))
against 50,000 example images from the OpenImage dataset. The benchmarks produce
the following metrics:

@@ -38,6 +38,16 @@ the following metrics:
- Mean Load Model Latency - the average amount of time it takes to load a model. This is done once per DoFn instance on worker
startup, so the cost is amortized across the pipeline.

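The relationship between the per-batch measurements and the reported means can be sketched in plain Python. This is an illustrative aggregation only, not Beam's actual metrics implementation; the function name and the sample numbers are ours:

```python
from statistics import mean

# Hypothetical sketch (not Beam's implementation): how the dashboard metrics
# relate to measurements collected during a benchmark run.
def summarize_run(batch_sizes, batch_latencies_ms, load_model_latency_ms):
    """Aggregate per-batch measurements into the reported mean metrics."""
    return {
        # Mean Inference Requested Batch Size
        'mean_inference_requested_batch_size': mean(batch_sizes),
        # Mean Inference Batch Latency
        'mean_inference_batch_latency_ms': mean(batch_latencies_ms),
        # Mean Load Model Latency: paid once per DoFn instance on worker
        # startup, so it is amortized over the whole pipeline.
        'mean_load_model_latency_ms': load_model_latency_ms,
    }

metrics = summarize_run([4, 8, 8], [120.0, 210.0, 190.0], 3500.0)
print(metrics['mean_inference_requested_batch_size'])  # ~6.67
```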
Approximate sizes of the models used in the tests:
* resnet101: 170.5 MB
* resnet152: 230.4 MB

The above tests are configured to run with the following settings:
* machine_type: n1-standard-2
* num_workers: 75
* autoscaling_algorithm: NONE
* disk_size_gb: 50
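The Jenkins job ultimately hands settings like these to the benchmark pipeline as command-line options. A minimal sketch of that translation, with a helper name of our own choosing (not a Beam API):

```python
# Hypothetical helper (not part of Beam): render a configuration map like the
# one in job_InferenceBenchmarkTests_Python.groovy into pipeline arguments.
def to_pipeline_args(config):
    return ['--{}={}'.format(key, value) for key, value in config.items()]

args = to_pipeline_args({
    'machine_type': 'n1-standard-2',
    'num_workers': 75,
    'autoscaling_algorithm': 'NONE',
    'disk_size_gb': 50,
})
print(args[0])  # --machine_type=n1-standard-2
```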

## Pytorch RunInference Language Modeling

The Pytorch RunInference Language Modeling benchmark runs an
@@ -50,4 +60,14 @@ the following metrics:
- Mean Inference Requested Batch Size - the average batch size that RunInference groups the inputs into for batch prediction
- Mean Inference Batch Latency - the average amount of time it takes to perform inference on a given batch of inputs
- Mean Load Model Latency - the average amount of time it takes to load a model. This is done once per DoFn instance on worker
startup, so the cost is amortized across the pipeline.

Approximate sizes of the models used in the tests:
* bert-base-uncased: 417.7 MB
* bert-large-uncased: 1.2 GB

The above tests are configured to run with the following settings:
* machine_type: n1-standard-2
* num_workers: 250
* autoscaling_algorithm: NONE
* disk_size_gb: 75
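Results are published to BigQuery and to InfluxDB under measurements such as `torch_language_modeling_bert_base_uncased`. As a rough illustration of what one InfluxDB line-protocol point for such a measurement looks like (the field name and values are made up, and the real publishing path goes through the test infrastructure, not this helper):

```python
def influx_line(measurement, fields, timestamp_ns):
    """Format one InfluxDB line-protocol point: <measurement> <fields> <ts>.

    Tags are omitted here for brevity; real points may carry them between
    the measurement name and the field set.
    """
    field_str = ','.join('{}={}'.format(k, v) for k, v in sorted(fields.items()))
    return '{} {} {}'.format(measurement, field_str, timestamp_ns)

line = influx_line(
    'torch_language_modeling_bert_base_uncased',
    {'mean_inference_batch_latency': 812.5},  # illustrative value
    1669161600000000000,
)
print(line)
```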
