opensearch-project · martin-gaievski · Jul 14, 2024 · Jul 19, 2024 · Jul 30, 2024 · Sep 9, 2024
@@ -0,0 +1,46 @@
+{
+  "settings": {
+    "index.number_of_shards": {{number_of_shards | default(1)}},
+    "index.number_of_replicas": {{number_of_replicas | default(0)}},
+    "index.queries.cache.enabled": {{query_cache_enabled | default(false) | tojson}},
+    "index.requests.cache.enable": {{requests_cache_enabled | default(false) | tojson}},
+    "index.merge.policy.max_merged_segment": "100GB",
+    "index.knn": true,
+    "default_pipeline": "nlp-ingest-pipeline"
+  },
+  "mappings": {
+    "dynamic": "true",
+    "_source": {
+      "enabled": {{ source_enabled | default(true) | tojson }}
+    },
+    "properties": {
+      "title": {
+        "type": "text"
+      },
+      "metadata": {
+        "type": "nested",
+        "properties": {
+          "url": {
+            "type": "text"
+          },
+          "pubmed_id": {
+            "type": "integer"
+          }
+        }
+      },
+      "passage_embedding": {
+        "type": "knn_vector",
+        "dimension": 768,
+        "method": {
+          "name": "hnsw",
+          "space_type": "innerproduct",
+          "engine": "faiss",
+          "parameters": {
+            "ef_construction": 256,
+            "m": 256
+          }
+        }
+      }
+    }
+  }
+}
@@ -0,0 +1,207 @@
+  {
+    "name": "index",
+    "operation-type": "bulk",
+    "bulk-size": {{bulk_size | default(100)}},
+    "ingest-percentage": {{ingest_percentage | default(100)}}
+  },
+  {
+    "name": "delete-ingest-pipeline",
+    "operation-type": "delete-pipeline",
+    "id": "nlp-ingest-pipeline"
+  },
+  {
+    "name": "create-ingest-pipeline",
+    "operation-type": "put-pipeline",
+    "param-source": "create-ingest-pipeline",
+    "id": "nlp-ingest-pipeline",  
+    "body": {
+      "description": "An NLP ingest pipeline",
+      "processors": [
+        {
+          "text_embedding": {
+            "model_id": "",
+            "field_map": {
+              "title": "passage_embedding"
+            }
+          }
+        }
+      ]
+    }
+  },
+  {
+    "name": "index-append",
+    "operation-type": "bulk",
+    "bulk-size": {{bulk_size | default(100)}},
+    "ingest-percentage": {{ingest_percentage | default(100)}}
+  },
+  {
+    "name": "default",
+    "operation-type": "search",
+    "body": {
+      "query": {
+        "match_all": {}
+      }
+    }
+  },
+  {
+    "name": "semantic-search-neural",
+    "operation-type": "search",
+    "variable-queries": {{variable_queries | default(0)}},
+    "param-source": "semantic-search-neural-source",
+    "body": {
+      "_source": {
+        "excludes": [
+          "passage_embedding"
+        ]
+      },
+      "query": {
+        "neural": {
+          "passage_embedding": {
+            "query_text": "what types of rapid testing for Covid-19 have been developed?",
+            "model_id": "",
+            "k": {{k | default(10)}}
+          }
+        }
+      }
+    }
+  },
+  {
+    "name": "create-normalization-processor-no-weights-search-pipeline",
+    "operation-type": "create-search-pipeline",
+    "id": "nlp-min-max-arithmetic-search-pipeline",
+    "body": {
+      "description": "Post processor for hybrid search with min_max normalization and arithmetic_mean combination",
+      "phase_results_processors": [
+        {
+          "normalization-processor": {
+            "normalization": {
+              "technique": "min_max"
+            },
+            "combination": {
+              "technique": "arithmetic_mean"
+            }
+          }
+        }
+      ]
+    }
+  },
+  {
+    "name": "semantic-search-hybrid-bm25-and-neural-search",
+    "operation-type": "search",
+    "request-params": {
+      "search_pipeline": "nlp-min-max-arithmetic-search-pipeline"
+    },
+    "variable-queries": {{variable_queries | default(0)}},
+    "param-source": "hybrid-query-bm25-neural-search-source",
+    "body": {
+      "_source": {
+        "excludes": [
+          "passage_embedding"
+        ]
+      },
+      "query": {
+        "hybrid": {
+          "queries": [
+            {
+              "match": {
+                "title": ""
+              }
+            },
+            {
+              "neural": {
+                "passage_embedding": {
+                  "query_text": "what types of rapid testing for Covid-19 have been developed?",
+                  "model_id": "",
+                  "k": {{k | default(10)}}
+                }
+              }
+            }
+          ] 
+        }
+      }
+    }
+  },
+  {
+    "name": "semantic-search-hybrid-bm25-and-knn-search",
+    "operation-type": "search",
+    "request-params": {
+      "search_pipeline": "nlp-min-max-arithmetic-search-pipeline"
+    },
+    "variable-queries": {{variable_queries | default(0)}},
+    "param-source": "hybrid-query-bm25-knn-search-source",
+    "body": {
+      "_source": {
+        "excludes": [
+          "passage_embedding"
+        ]
+      },
+      "query": {
+        "hybrid": {
+          "queries": [
+            {
+              "match": {
+                "title": ""
+              }
+            },
+            {
+              "knn": {
+                "passage_embedding": {
+                  "vector": "[1, 2, 3]",
+                  "k": {{k | default(100)}}
+                }
+              }
+            }
+          ] 
+        }
+      }
+    }
+  },
+  {
+    "name": "semantic-search-hybrid-bm25-range-and-neural-search",
+    "operation-type": "search",
+    "request-params": {
+      "search_pipeline": "nlp-min-max-arithmetic-search-pipeline"
+    },
+    "variable-queries": {{variable_queries | default(0)}},
+    "param-source": "hybrid-query-bm25-neural-search-source",
+    "body": {
+      "_source": {
+        "excludes": [
+          "passage_embedding"
+        ]
+      },
+      "query": {
+        "hybrid": {
+          "queries": [
+            {
+              "match": {
+                "title": ""
+              }
+            },
+            {
+              "neural": {
+                "passage_embedding": {
+                  "query_text": "what types of rapid testing for Covid-19 have been developed?",
+                  "model_id": "",
+                  "k": {{k | default(10)}}
+                }
+              }
+            },
+            {
+              "nested": {
+                "path": "metadata",
+                "query": {
+                  "range": {
+                    "metadata.pubmed_id": {
+                      "gte": {{range_gte | default(100)}},
+                      "lte": {{range_lte | default(10000000)}}
+                    }
+                  }
+                }
+              }
+            }
+          ]
+        }
+      }
+    }
+  }
@@ -0,0 +1,13 @@
+{
+    "bulk_indexing_clients": 4,
+    "bulk_size": 200,
+    "number_of_replicas": 1,
+    "number_of_shards": 8,
+    "ingest_percentage": 100,
+    "search_clients": 8,
+    "warmup_iterations": 20,
+    "iterations": 100,
+    "variable_queries": 50,
+    "k": 100,
+    "only_run_on_ml_node": "false"
+}
@@ -0,0 +1,14 @@
+{
+    "bulk_indexing_clients": 4,
+    "bulk_size": 200,
+    "number_of_replicas": 1,
+    "number_of_shards": 8,
+    "ingest_percentage": 100,
+    "search_clients": 8,
+    "warmup_iterations": 20,
+    "iterations": 100,
+    "variable_queries": 50,
+    "k": 100,
+    "only_run_on_ml_node": "false",
+    "concurent_segment_search_enabled": "true"
+}
@@ -0,0 +1,13 @@
+{
+    "bulk_indexing_clients": 4,
+    "bulk_size": 200,
+    "number_of_replicas": 1,
+    "number_of_shards": 8,
+    "ingest_percentage": 100,
+    "search_clients": 8,
+    "warmup_iterations": 20,
+    "iterations": 100,
+    "variable_queries": 50,
+    "k": 100,
+    "only_run_on_ml_node": "true"
+}
@@ -0,0 +1,14 @@
+{
+    "bulk_indexing_clients": 4,
+    "bulk_size": 200,
+    "number_of_replicas": 1,
+    "number_of_shards": 8,
+    "ingest_percentage": 100,
+    "search_clients": 8,
+    "warmup_iterations": 20,
+    "iterations": 100,
+    "variable_queries": 50,
+    "k": 100,
+    "only_run_on_ml_node": "true",
+    "concurent_segment_search_enabled": "true"
+}
@@ -0,0 +1,38 @@
+# SPDX-License-Identifier: Apache-2.0
+#
+# The OpenSearch Contributors require contributions made to
+# this file be licensed under the Apache-2.0 license or a
+# compatible open source license.
+
+from osbenchmark.worker_coordinator.runner import Retry, Runner
+from osbenchmark.client import RequestContextHolder
+
+# This runner class and its registration are a temporary workaround while the next version of OSB is pending release
+def register(registry):
+    registry.register_runner(
+        UpdateConcurrentSegmentSearchSettings.RUNNER_NAME,
+                    Retry(UpdateConcurrentSegmentSearchSettings()), async_runner=True
+    )
+
+# Module-level holder the runner below calls to mark the start and end of its client request.
+request_context_holder = RequestContextHolder()
+
+class UpdateConcurrentSegmentSearchSettings(Runner):
+
+    RUNNER_NAME = "update-concurrent-segment-search-settings"
+
+    async def __call__(self, opensearch, params):
+        enable_setting = params.get("enable", "false")
+        max_slice_count = params.get("max_slice_count", None)
+        body = {
+            "persistent": {
+                "search.concurrent_segment_search.enabled": enable_setting
+            }
+        }
+        if max_slice_count is not None:
+            body["persistent"]["search.concurrent.max_slice_count"] = max_slice_count
+        request_context_holder.on_client_request_start()
+        await opensearch.cluster.put_settings(body=body)
+        request_context_holder.on_client_request_end()
+
+    def __repr__(self, *args, **kwargs):
+        return self.RUNNER_NAME
@@ -0,0 +1,18 @@
+  {
+    "operation": {
+      "operation-type": "delete-ml-model",
+      "model-name": "{{ model_name | default('huggingface/sentence-transformers/all-mpnet-base-v2')}}"
+    }
+  },
+  {
+    "operation": {
+      "operation-type": "register-ml-model",
+      "model-name": "{{ model_name | default('huggingface/sentence-transformers/all-mpnet-base-v2')}}",
+      "model-version": "{{ model_version | default('1.0.1') }}",
+      "model-format": "{{ model_format | default('TORCH_SCRIPT') }}",
+      "model-config-file": "{{ model_config_file | default('') }}"
+    }
+  },
+  {
+    "operation": "deploy-ml-model"
+  }
@@ -0,0 +1,24 @@
+  {
+    "operation": "semantic-search-neural",
+    "warmup-iterations": {{warmup_iterations | default(50) | tojson}},
+    "iterations": {{iterations | default(100) | tojson }},
+    "clients": {{ search_clients | default(1)}}
+  },
+  {
+    "operation": "semantic-search-hybrid-bm25-and-knn-search",
+    "warmup-iterations": {{warmup_iterations | default(50) | tojson}},
+    "iterations": {{iterations | default(100) | tojson }},
+    "clients": {{ search_clients | default(1)}}
+  },
+  {
+    "operation": "semantic-search-hybrid-bm25-and-neural-search",
+    "warmup-iterations": {{warmup_iterations | default(50) | tojson}},
+    "iterations": {{iterations | default(100) | tojson }},
+    "clients": {{ search_clients | default(1)}}
+  },
+  {
+    "operation": "semantic-search-hybrid-bm25-range-and-neural-search",
+    "warmup-iterations": {{warmup_iterations | default(50) | tojson}},
+    "iterations": {{iterations | default(100) | tojson }},
+    "clients": {{ search_clients | default(1)}}
+  }