From 203fb92a0f3f02fdf9cb7521e73cb5b6284d97ac Mon Sep 17 00:00:00 2001 From: Martin Gaievski Date: Sun, 14 Jul 2024 22:03:52 +0000 Subject: [PATCH 1/4] Adding semantic search workload that includes vector and bm25 search Signed-off-by: Martin Gaievski --- trec_covid_semantic_search/README.md | 265 ++++++++++++++++++ trec_covid_semantic_search/index.json | 46 +++ .../operations/default.json | 207 ++++++++++++++ trec_covid_semantic_search/params/params.json | 12 + .../test_procedures/procedures.json | 168 +++++++++++ trec_covid_semantic_search/workload.json | 30 ++ trec_covid_semantic_search/workload.py | 186 ++++++++++++ .../workload_queries_knn.json | 6 + 8 files changed, 920 insertions(+) create mode 100644 trec_covid_semantic_search/README.md create mode 100644 trec_covid_semantic_search/index.json create mode 100644 trec_covid_semantic_search/operations/default.json create mode 100644 trec_covid_semantic_search/params/params.json create mode 100644 trec_covid_semantic_search/test_procedures/procedures.json create mode 100644 trec_covid_semantic_search/workload.json create mode 100644 trec_covid_semantic_search/workload.py create mode 100644 trec_covid_semantic_search/workload_queries_knn.json diff --git a/trec_covid_semantic_search/README.md b/trec_covid_semantic_search/README.md new file mode 100644 index 00000000..6374cdf6 --- /dev/null +++ b/trec_covid_semantic_search/README.md @@ -0,0 +1,265 @@ +# Semantic Search Workload + +This workload is aimed to benchmark performance of Semantic Search queries. Ingested documents will have embeddings that are generated during ingestion process by pre-trained local model. + +## Datasets + +We usae processed version of trec-covid dataset. Trec-Covid is a dataset collection of documents about COVID-19 information. 
+ +- Trec-Covid website: https://ir.nist.gov/covidSubmit/index.html +- Dataset: https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/trec-covid.zip + +We processed the dataset by creating 6 copies of the same document and shuffle copies so they are ingested in random order. We create custom artifact for queries by extracting queries portion from original `trec-covid` dataset and generating vector embeddings for query text using 768 dimension vector, same dimensions that used for document ingestion. + +### Example Document + +Following is example of document that is beeing ingested during indexing: + +```json +{ + "title": "Simultaneous Video-EEG-ECG Monitoring to Identify Neurocardiac Dysfunction in Mouse Models of Epilepsy.", + "metadata": { + "url": "https://doi.org/10.3791/57300; https://www.ncbi.nlm.nih.gov/pubmed/29443088/", + "pubmed_id": "29443088" + } +} +``` + +Following is example of query: + +```json +{ + "_id": "1", + "query": "what is the origin of COVID-19", + "vector_embedding": [ + -0.06979332, + 0.05764826, + ... + ] +} + +``` + +## Parameters + +This workload allows the following parameters to be specified using `--workload-params`: + +* `bulk_size` (default: 100) +* `bulk_indexing_clients` (default: 1): Number of clients that issue bulk indexing requests. +* `ingest_percentage` (default: 100): A number between 0 and 100 that defines how much of the document corpus should be ingested. +* `number_of_replicas` (default: 0) +* `number_of_shards` (default: 1) +* `query_cache_enabled` (default: false) +* `requests_cache_enabled` (default: false) +* `source_enabled` (default: true): A boolean defining whether the `_source` field is stored in the index. +* `force_merge_max_num_segments` (default: unset): An integer specifying the max amount of segments the force-merge operation should use. +* `index_settings`: A list of index settings. Index settings defined elsewhere (e.g. `number_of_replicas`) need to be overridden explicitly. 
+* `cluster_health` (default: "green"): The minimum required cluster health. +* `error_level` (default: "non-fatal"): Available for bulk operations only to specify ignore-response-error-level. +* `target_throughput` (default: default values for each operation): Number of requests per second, `""` for no limit. +* `search_clients`: Number of clients that issue search requests. +* `model_name` (default: huggingface/sentence-transformers/all-mpnet-base-v2) OpenSearch-provided pretrained model name. +* `model_version` (default: 1.0.1) Model version. +* `model_format` (default: TORCH_SCRIPT) Model format. +* `dimensions` (default: 768): Vector dimensions, needed to match the model. +* `engine` (default:` lucene): The approximate k-NN library to use for indexing and search. +* `method` (default:` hnsw): K-NN search algorithm. +* `space_type` (default:` l2): The vector space used to calculate the distance between vectors. +* `k` (default: 10) Number of nearest neighbors are returned. +* `warmup_iterations` Number of Warmup iteration of each search client executes. +* `iterations` Number of test iterations of each search client executes. +* `num_variable_queries` (default: 0) Number of variable queries will be used for the semantic search task, 0 means fixed query and max value is 50. +* `range_gte` (default: 100) Number that defines the lower bound (inclusive) for range query when it's used as elemnts in semantic search query +* `range_lte` (default: 10000000) Number that defines the upper bound (inclusive) for range query when it's used as elemnts in semantic search query + +### Running a benchmark + +Before running a benchmark, ensure that the load generation host is able to access your cluster endpoint and that the +appropriate dataset is available on the host. + +Currently, we support 2 test procedures for the semantic search workload. The default procedure is `create-index-ingest-data-search` and does create an index, ingest data and run a base set of search queries. 
+ +To run the default workload, invoke the following command. + +``` +# OpenSearch Cluster End point url with hostname and port +export ENDPOINT= +# Absolute file path of Workload file +export WORKLOAD_PATH= + +opensearch-benchmark execute-test \ + --workload-path=$WORKLOAD_PATH \ + --workload-params="/trec_covid_semantic_search/params/params.json" \ + --pipeline=benchmark-only \ + --target-host=$ENDPOINT \ + --kill-running-processes \ + --test-procedure="search" +``` + +## Current Procedures + +### Create index with data + +This procedure creates index, deploy model localy, creaes pipeline with ingest and search processors and ingest documents. At the end we ran the match_all query that returns all documents in the index. +Procedure name `create-index-ingest-data-search`. +This is a default precedure for this workload. + +### Run semantic search queries + +This search procedure runs semantic search queries: neural, hybrid. It deletes and deploys an ml model and creates processor and uses this model to generate search specific embeddings. +Procedure name `search`. + +#### Sample Output + +The output of a sample test run is provided below. Metrics are captured in the result's data store as usual, and this can be configured to be +either in-memory, or an external OpenSearch cluster. 
+ +``` + + ____ _____ __ ____ __ __ + / __ \____ ___ ____ / ___/___ ____ ___________/ /_ / __ )___ ____ _____/ /_ ____ ___ ____ ______/ /__ + / / / / __ \/ _ \/ __ \\__ \/ _ \/ __ `/ ___/ ___/ __ \ / __ / _ \/ __ \/ ___/ __ \/ __ `__ \/ __ `/ ___/ //_/ +/ /_/ / /_/ / __/ / / /__/ / __/ /_/ / / / /__/ / / / / /_/ / __/ / / / /__/ / / / / / / / / /_/ / / / ,< +\____/ .___/\___/_/ /_/____/\___/\__,_/_/ \___/_/ /_/ /_____/\___/_/ /_/\___/_/ /_/_/ /_/ /_/\__,_/_/ /_/|_| + /_/ + +[INFO] [Test Execution ID]: 3ff68a05-9aa7-4375-966e-8e686a8d14d3 +[INFO] Executing test with workload [trec_covid_semantic_search], test_procedure [search] and provision_config_instance ['external'] with version [2.15.0]. + +[WARNING] merges_total_time is 76929 ms indicating that the cluster is not in a defined clean state. Recorded index time metrics may be misleading. +[WARNING] merges_total_throttled_time is 56405 ms indicating that the cluster is not in a defined clean state. Recorded index time metrics may be misleading. +[WARNING] indexing_total_time is 1884805 ms indicating that the cluster is not in a defined clean state. Recorded index time metrics may be misleading. +[WARNING] refresh_total_time is 944585 ms indicating that the cluster is not in a defined clean state. Recorded index time metrics may be misleading. +[WARNING] flush_total_time is 1674638 ms indicating that the cluster is not in a defined clean state. Recorded index time metrics may be misleading. 
+Running delete-ml-model [100% done] +Running register-ml-model [100% done] +Running deploy-ml-model [100% done] +Running create-normalization-processor-no-weights-search-pipeline [100% done] +Running semantic-search-neural [100% done] +Running semantic-search-hybrid-bm25-and-knn-search [100% done] +Running semantic-search-hybrid-bm25-and-neural-search [100% done] +Running semantic-search-hybrid-bm25-range-and-neural-search [100% done] + +------------------------------------------------------ + _______ __ _____ + / ____(_)___ ____ _/ / / ___/_________ ________ + / /_ / / __ \/ __ `/ / \__ \/ ___/ __ \/ ___/ _ \ + / __/ / / / / / /_/ / / ___/ / /__/ /_/ / / / __/ +/_/ /_/_/ /_/\__,_/_/ /____/\___/\____/_/ \___/ +------------------------------------------------------ + +| Metric | Task | Value | Unit | +|---------------------------------------------------------------:|----------------------------------------------------:|----------:|-------:| +| Cumulative indexing time of primary shards | | 31.5392 | min | +| Min cumulative indexing time across primary shards | | 0 | min | +| Median cumulative indexing time across primary shards | | 2.65797 | min | +| Max cumulative indexing time across primary shards | | 6.65067 | min | +| Cumulative indexing throttle time of primary shards | | 0 | min | +| Min cumulative indexing throttle time across primary shards | | 0 | min | +| Median cumulative indexing throttle time across primary shards | | 0 | min | +| Max cumulative indexing throttle time across primary shards | | 0 | min | +| Cumulative merge time of primary shards | | 1.49212 | min | +| Cumulative merge count of primary shards | | 66 | | +| Min cumulative merge time across primary shards | | 0 | min | +| Median cumulative merge time across primary shards | | 0.02405 | min | +| Max cumulative refresh time across primary shards | | 3.16232 | min | +| Cumulative flush time of primary shards | | 27.9703 | min | +| Cumulative flush count of primary shards | | 43 | | +| Min 
cumulative flush time across primary shards | | 0 | min | +| Median cumulative flush time across primary shards | | 2.21 | min | +| Max cumulative flush time across primary shards | | 5.80563 | min | +| Total Young Gen GC time | | 0.26 | s | +| Total Young Gen GC count | | 9 | | +| Total Old Gen GC time | | 0 | s | +| Total Old Gen GC count | | 0 | | +| Store size | | 30.2634 | GB | +| Translog size | | 0.0721771 | GB | +| Heap used for segments | | 0 | MB | +| Heap used for doc values | | 0 | MB | +| Heap used for terms | | 0 | MB | +| Heap used for norms | | 0 | MB | +| Median cumulative flush time across primary shards | | 2.21 | min | +| Max cumulative flush time across primary shards | | 5.80563 | min | +| Total Young Gen GC time | | 0.26 | s | +| Total Young Gen GC count | | 9 | | +| Total Old Gen GC time | | 0 | s | +| Total Old Gen GC count | | 0 | | +| Store size | | 30.2634 | GB | +| Translog size | | 0.0721771 | GB | +| Heap used for segments | | 0 | MB | +| Heap used for doc values | | 0 | MB | +| Heap used for terms | | 0 | MB | +| Heap used for norms | | 0 | MB | +| Heap used for points | | 0 | MB | +| Heap used for stored fields | | 0 | MB | +| Segment count | | 222 | | +| Min Throughput | semantic-search-neural | 25.58 | ops/s | +| Mean Throughput | semantic-search-neural | 32.28 | ops/s | +| Median Throughput | semantic-search-neural | 33.23 | ops/s | +| Max Throughput | semantic-search-neural | 34.79 | ops/s | +| 50th percentile latency | semantic-search-neural | 210.864 | ms | +| 90th percentile latency | semantic-search-neural | 232.103 | ms | +| 99th percentile latency | semantic-search-neural | 259.537 | ms | +| 100th percentile latency | semantic-search-neural | 287.864 | ms | +| 50th percentile service time | semantic-search-neural | 210.864 | ms | +| 90th percentile service time | semantic-search-neural | 232.103 | ms | +| 99th percentile service time | semantic-search-neural | 259.537 | ms | +| 100th percentile service time | 
semantic-search-neural | 287.864 | ms | +| error rate | semantic-search-neural | 0 | % | +| Min Throughput | semantic-search-hybrid-bm25-and-knn-search | 67.79 | ops/s | +| Mean Throughput | semantic-search-hybrid-bm25-and-knn-search | 71.87 | ops/s | +| Median Throughput | semantic-search-hybrid-bm25-and-knn-search | 72.71 | ops/s | +| Max Throughput | semantic-search-hybrid-bm25-and-knn-search | 73.51 | ops/s | +| 50th percentile latency | semantic-search-hybrid-bm25-and-knn-search | 103.806 | ms | +| 90th percentile latency | semantic-search-hybrid-bm25-and-knn-search | 111.644 | ms | +| 99th percentile latency | semantic-search-hybrid-bm25-and-knn-search | 118.395 | ms | +| 100th percentile latency | semantic-search-hybrid-bm25-and-knn-search | 122.929 | ms | +| 50th percentile service time | semantic-search-hybrid-bm25-and-knn-search | 103.806 | ms | +| 90th percentile service time | semantic-search-hybrid-bm25-and-knn-search | 111.644 | ms | +| 99th percentile service time | semantic-search-hybrid-bm25-and-knn-search | 118.395 | ms | +| 100th percentile service time | semantic-search-hybrid-bm25-and-knn-search | 122.929 | ms | +| error rate | semantic-search-hybrid-bm25-and-knn-search | 0 | % | +| Min Throughput | semantic-search-hybrid-bm25-and-neural-search | 35.59 | ops/s | +| Mean Throughput | semantic-search-hybrid-bm25-and-neural-search | 36.28 | ops/s | +| Median Throughput | semantic-search-hybrid-bm25-and-neural-search | 36.34 | ops/s | +| Max Throughput | semantic-search-hybrid-bm25-and-neural-search | 36.63 | ops/s | +| 50th percentile latency | semantic-search-hybrid-bm25-and-neural-search | 213.2 | ms | +| 90th percentile latency | semantic-search-hybrid-bm25-and-neural-search | 232.455 | ms | +| 99th percentile latency | semantic-search-hybrid-bm25-and-neural-search | 265.864 | ms | +| 100th percentile latency | semantic-search-hybrid-bm25-and-neural-search | 300.295 | ms | +| 50th percentile service time | 
semantic-search-hybrid-bm25-and-neural-search | 213.2 | ms | +| 90th percentile service time | semantic-search-hybrid-bm25-and-neural-search | 232.455 | ms | +| 99th percentile service time | semantic-search-hybrid-bm25-and-neural-search | 265.864 | ms | +| 100th percentile service time | semantic-search-hybrid-bm25-and-neural-search | 300.295 | ms | +| error rate | semantic-search-hybrid-bm25-and-neural-search | 0 | % | +| Min Throughput | semantic-search-hybrid-bm25-range-and-neural-search | 34.65 | ops/s | +| Mean Throughput | semantic-search-hybrid-bm25-range-and-neural-search | 35.98 | ops/s | +| Median Throughput | semantic-search-hybrid-bm25-range-and-neural-search | 36.22 | ops/s | +| Max Throughput | semantic-search-hybrid-bm25-range-and-neural-search | 36.38 | ops/s | +| 50th percentile latency | semantic-search-hybrid-bm25-range-and-neural-search | 214.191 | ms | +| 90th percentile latency | semantic-search-hybrid-bm25-range-and-neural-search | 234.587 | ms | +| 99th percentile latency | semantic-search-hybrid-bm25-range-and-neural-search | 259.207 | ms | +| 100th percentile latency | semantic-search-hybrid-bm25-range-and-neural-search | 276.345 | ms | +| 50th percentile service time | semantic-search-hybrid-bm25-range-and-neural-search | 214.191 | ms | +| 90th percentile service time | semantic-search-hybrid-bm25-range-and-neural-search | 234.587 | ms | +| 99th percentile service time | semantic-search-hybrid-bm25-range-and-neural-search | 259.207 | ms | +| 100th percentile service time | semantic-search-hybrid-bm25-range-and-neural-search | 276.345 | ms | +| error rate | semantic-search-hybrid-bm25-range-and-neural-search | 0 | % | + + +--------------------------------- +[INFO] SUCCESS (took 174 seconds) +--------------------------------- +``` + +## License + +Following license used by original dataset and we're using it too. 
+``` + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ +``` +Covid-trec [1] is part of the COVID-19 Open Research dataset [2], which is licensed under Apache 2.0. +[1] https://arxiv.org/pdf/2005.04474v1.pdf +[2] https://github.com/allenai/cord19/ diff --git a/trec_covid_semantic_search/index.json b/trec_covid_semantic_search/index.json new file mode 100644 index 00000000..b3a10a2b --- /dev/null +++ b/trec_covid_semantic_search/index.json @@ -0,0 +1,46 @@ +{ + "settings": { + "index.number_of_shards": {{number_of_shards | default(1)}}, + "index.number_of_replicas": {{number_of_replicas | default(0)}}, + "index.queries.cache.enabled": {{query_cache_enabled | default(false) | tojson}}, + "index.requests.cache.enable": {{requests_cache_enabled | default(false) | tojson}}, + "index.merge.policy.max_merged_segment": "100GB", + "index.knn": true, + "default_pipeline": "nlp-ingest-pipeline" + }, + "mappings": { + "dynamic": "true", + "_source": { + "enabled": {{ source_enabled | default(true) | tojson }} + }, + "properties": { + "title": { + "type": "text" + }, + "metadata": { + "type": "nested", + "properties": { + "url": { + "type": "text" + }, + "pubmed_id": { + "type": "integer" + } + } + }, + "passage_embedding": { + "type": "knn_vector", + "dimension": 768, + "method": { + "name": "hnsw", + "space_type": "innerproduct", + "engine": "faiss", + "parameters": { + "ef_construction": 256, + "m": 256 + } + } + } + } + } +} diff --git a/trec_covid_semantic_search/operations/default.json b/trec_covid_semantic_search/operations/default.json new file mode 100644 index 00000000..73048ba9 --- /dev/null +++ b/trec_covid_semantic_search/operations/default.json @@ -0,0 +1,207 @@ +{ + "name": "index", + "operation-type": "bulk", + "bulk-size": {{bulk_size | default(100)}}, + "ingest-percentage": {{ingest_percentage | default(100)}} +}, +{ + "name": "delete-ingest-pipeline", + "operation-type": "delete-pipeline", + "id": "nlp-ingest-pipeline" + }, + { + 
"name": "create-ingest-pipeline", + "operation-type": "put-pipeline", + "param-source": "create-ingest-pipeline", + "id": "nlp-ingest-pipeline", + "body": { + "description": "An NLP ingest pipeline", + "processors": [ + { + "text_embedding": { + "model_id": "", + "field_map": { + "title": "passage_embedding" + } + } + } + ] + } + }, + { + "name": "index-append", + "operation-type": "bulk", + "bulk-size": {{bulk_size | default(100)}}, + "ingest-percentage": {{ingest_percentage | default(100)}} + }, + { + "name": "default", + "operation-type": "search", + "body": { + "query": { + "match_all": {} + } + } + }, + { + "name": "semantic-search-neural", + "operation-type": "search", + "variable-queries": {{variable_queries | default(0)}}, + "param-source": "semantic-search-neural-source", + "body": { + "_source": { + "excludes": [ + "passage_embedding" + ] + }, + "query": { + "neural": { + "passage_embedding": { + "query_text": "what types of rapid testing for Covid-19 have been developed?", + "model_id": "", + "k": {{k | default(10)}} + } + } + } + } + }, + { + "name": "create-normalization-processor-no-weights-search-pipeline", + "operation-type": "create-search-pipeline", + "id": "nlp-min-max-arithmetic-search-pipeline", + "body": { + "description": "Post processor for hybrid search with min_max normalization and arithmetic_mean combination", + "phase_results_processors": [ + { + "normalization-processor": { + "normalization": { + "technique": "min_max" + }, + "combination": { + "technique": "arithmetic_mean" + } + } + } + ] + } + }, + { + "name": "semantic-search-hybrid-bm25-and-neural-search", + "operation-type": "search", + "request-params": { + "search_pipeline": "nlp-min-max-arithmetic-search-pipeline" + }, + "variable-queries": {{variable_queries | default(0)}}, + "param-source": "hybrid-query-bm25-neural-search-source", + "body": { + "_source": { + "excludes": [ + "passage_embedding" + ] + }, + "query": { + "hybrid": { + "queries": [ + { + "match": { + "title": 
"" + } + }, + { + "neural": { + "passage_embedding": { + "query_text": "what types of rapid testing for Covid-19 have been developed?", + "model_id": "", + "k": {{k | default(10)}} + } + } + } + ] + } + } + } + }, + { + "name": "semantic-search-hybrid-bm25-and-knn-search", + "operation-type": "search", + "request-params": { + "search_pipeline": "nlp-min-max-arithmetic-search-pipeline" + }, + "variable-queries": {{variable_queries | default(0)}}, + "param-source": "hybrid-query-bm25-knn-search-source", + "body": { + "_source": { + "excludes": [ + "passage_embedding" + ] + }, + "query": { + "hybrid": { + "queries": [ + { + "match": { + "title": "" + } + }, + { + "knn": { + "passage_embedding": { + "vector": "[1, 2, 3]", + "k": {{k | default(100)}} + } + } + } + ] + } + } + } + }, + { + "name": "semantic-search-hybrid-bm25-range-and-neural-search", + "operation-type": "search", + "request-params": { + "search_pipeline": "nlp-min-max-arithmetic-search-pipeline" + }, + "variable-queries": {{variable_queries | default(0)}}, + "param-source": "hybrid-query-bm25-neural-search-source", + "body": { + "_source": { + "excludes": [ + "passage_embedding" + ] + }, + "query": { + "hybrid": { + "queries": [ + { + "match": { + "title": "" + } + }, + { + "neural": { + "passage_embedding": { + "query_text": "what types of rapid testing for Covid-19 have been developed?", + "model_id": "", + "k": {{k | default(10)}} + } + } + }, + { + "nested": { + "path": "metadata", + "query": { + "range": { + "metadata.pubmed_id": { + "gte": {{range_gte | default(100)}}, + "lte": {{range_lte | default(10000000)}} + } + } + } + } + } + ] + } + } + } + } diff --git a/trec_covid_semantic_search/params/params.json b/trec_covid_semantic_search/params/params.json new file mode 100644 index 00000000..310565b0 --- /dev/null +++ b/trec_covid_semantic_search/params/params.json @@ -0,0 +1,12 @@ +{ + "bulk_indexing_clients": 2, + "bulk_size": 100, + "number_of_replicas": 1, + "number_of_shards" :8, + 
"ingest_percentage":100, + "search_clients": 8, + "warmup_iterations": 20, + "iterations": 100, + "variable_queries": 50, + "k": 100 +} diff --git a/trec_covid_semantic_search/test_procedures/procedures.json b/trec_covid_semantic_search/test_procedures/procedures.json new file mode 100644 index 00000000..87b6a0c9 --- /dev/null +++ b/trec_covid_semantic_search/test_procedures/procedures.json @@ -0,0 +1,168 @@ +{ + "name": "create-index-ingest-data-search", + "description": "Indexes the whole document corpus using OpenSearch default settings. After that several query groups are run.", + "default": true, + "schedule": [ + { + "name": "cluster-settings", + "operation": { + "operation-type": "put-settings", + "body": { + "persistent": { + "plugins": { + "ml_commons": { + "only_run_on_ml_node": "false", + "native_memory_threshold": "100", + "allow_registering_model_via_local_file": "true", + "allow_registering_model_via_url": "true" + } + } + } + } + } + }, + { + "operation": "delete-index" + }, + { + "operation": "delete-ingest-pipeline" + }, + { + "operation": { + "operation-type": "delete-ml-model", + "model-name": "{{ model_name | default('huggingface/sentence-transformers/all-mpnet-base-v2')}}" + } + }, + { + "operation": { + "operation-type": "register-ml-model", + "model-name": "{{ model_name | default('huggingface/sentence-transformers/all-mpnet-base-v2')}}", + "model-version": "{{ model_version | default('1.0.1') }}", + "model-format": "{{ model_format | default('TORCH_SCRIPT') }}", + "model-config-file": "{{ model_config_file | default('') }}" + } + }, + { + "operation": "deploy-ml-model" + }, + { + "operation": "create-ingest-pipeline" + }, + { + "operation": { + "operation-type": "create-index", + "settings": {%- if index_settings is defined %} {{index_settings | tojson}} {%- else %} { + "index.number_of_shards": {{number_of_shards | default(3)}}, + "index.number_of_replicas": {{number_of_replicas | default(0)}}, + "index.store.type": "{{store_type | 
default('fs')}}" + }{%- endif %} + } + }, + { + "name": "check-cluster-health-before-index-creation", + "operation": { + "operation-type": "cluster-health", + "index": "trec-covid", + "request-params": { + "wait_for_status": "{{cluster_health | default('green')}}", + "wait_for_no_relocating_shards": "true" + }, + "retry-until-success": true + } + }, + { + "operation": "index-append", + "warmup-time-period": 60, + "clients": {{bulk_indexing_clients | default(1)}}, + "ignore-response-error-level": "{{error_level | default('non-fatal')}}" + }, + { + "name": "refresh-after-index-created", + "operation": "refresh" + }, + { + "operation": { + "operation-type": "force-merge", + "request-timeout": 7200{%- if force_merge_max_num_segments is defined %}, + "max-num-segments": {{ force_merge_max_num_segments | tojson }} + {%- endif %} + } + }, + { + "name": "refresh-after-force-merge", + "operation": "refresh" + }, + { + "name": "wait-until-merges-finish", + "operation": { + "operation-type": "index-stats", + "index": "_all", + "condition": { + "path": "_all.total.merges.current", + "expected-value": 0 + }, + "retry-until-success": true, + "include-in-reporting": false + } + }, + { + "operation": "default", + "warmup-iterations": {{warmup_iterations | default(500) | tojson}}, + "iterations": {{iterations | default(500) | tojson }}, + "target-throughput": {{ target_throughput | default(100) | tojson}}, + "clients": {{ search_clients | default(1) }} + } + ] +}, +{ + "name": "search", + "description": "Run semantic search work.", + "default": false, + "schedule": [ + { + "operation": { + "operation-type": "delete-ml-model", + "model-name": "{{ model_name | default('huggingface/sentence-transformers/all-mpnet-base-v2')}}" + } + }, + { + "operation": { + "operation-type": "register-ml-model", + "model-name": "{{ model_name | default('huggingface/sentence-transformers/all-mpnet-base-v2')}}", + "model-version": "{{ model_version | default('1.0.1') }}", + "model-format": "{{ 
model_format | default('TORCH_SCRIPT') }}", + "model-config-file": "{{ model_config_file | default('') }}" + } + }, + { + "operation": "deploy-ml-model" + }, + { + "operation": "create-normalization-processor-no-weights-search-pipeline" + }, + { + "operation": "semantic-search-neural", + "warmup-iterations": {{warmup_iterations | default(50) | tojson}}, + "iterations": {{iterations | default(100) | tojson }}, + "clients": {{ search_clients | default(1)}} + }, + { + "operation": "semantic-search-hybrid-bm25-and-knn-search", + "warmup-iterations": {{warmup_iterations | default(50) | tojson}}, + "iterations": {{iterations | default(100) | tojson }}, + "clients": {{ search_clients | default(1)}} + }, + { + "operation": "semantic-search-hybrid-bm25-and-neural-search", + "warmup-iterations": {{warmup_iterations | default(50) | tojson}}, + "iterations": {{iterations | default(100) | tojson }}, + "clients": {{ search_clients | default(1)}} + }, + { + "operation": "semantic-search-hybrid-bm25-range-and-neural-search", + "warmup-iterations": {{warmup_iterations | default(50) | tojson}}, + "iterations": {{iterations | default(100) | tojson }}, + "clients": {{ search_clients | default(1)}} + } + ] +} \ No newline at end of file diff --git a/trec_covid_semantic_search/workload.json b/trec_covid_semantic_search/workload.json new file mode 100644 index 00000000..b8eedc27 --- /dev/null +++ b/trec_covid_semantic_search/workload.json @@ -0,0 +1,30 @@ +{% import "benchmark.helpers" as benchmark with context %} + +{ + "version": 2, + "description": "Benchmark performance of semantic search queries based on dataset of global daily weather measurements from NOAA", + "indices": [ + { + "name": "trec-covid", + "body": "index.json" + } + ], + "corpora": [ + { + "name": "trec-covid", + "base-url": "https://github.com/martin-gaievski/neural-search/releases/download/trec_covid_dataset_1M_v1", + "documents": [ + { + "source-file": "documents.json.zip", + "document-count": 1027950 + } + ] + } + 
], + "operations": [ + {{ benchmark.collect(parts="operations/*.json") }} + ], + "test_procedures": [ + {{ benchmark.collect(parts="test_procedures/*.json") }} + ] +} diff --git a/trec_covid_semantic_search/workload.py b/trec_covid_semantic_search/workload.py new file mode 100644 index 00000000..073b8344 --- /dev/null +++ b/trec_covid_semantic_search/workload.py @@ -0,0 +1,186 @@ +import random +import os +import json +from pathlib import Path + +from osbenchmark.workload.loader import Downloader +from osbenchmark.workload.loader import Decompressor +from osbenchmark.workload.loader import Decompressor + +script_dir = os.path.dirname(os.path.realpath(__file__)) + +def ingest_pipeline_param_source(workload, params, **kwargs): + model_id = params['body']['processors'][0]['text_embedding']['model_id'] + if not model_id: + with open('model_id.json') as f: + d = json.loads(f.read()) + model_id = d['model_id'] + params['body']['processors'][0]['text_embedding']['model_id'] = model_id + return params + +class QueryParamSourceNeural: + def __init__(self, workload, params, **kwargs): + if len(workload.indices) == 1: + index = workload.indices[0].name + if len(workload.indices[0].types) == 1: + type = workload.indices[0].types[0].name + else: + type = None + else: + index = "_all" + type = None + + self._params = params + self._params['index'] = index + self._params['type'] = type + self._params['variable-queries'] = params.get("variable-queries", 0) + self.infinite = True + + if self._params['variable-queries'] > 0: + with open(script_dir + os.sep + 'workload_queries_knn.json', 'r') as f: + d = json.loads(f.read()) + source_file = d['source-file'] + base_url = d['base-url'] + compressed_bytes = d['compressed-bytes'] + uncompressed_bytes = d['uncompressed-bytes'] + compressed_path = script_dir + os.sep + source_file + uncompressed_path = script_dir + os.sep + Path(source_file).stem + if not os.path.exists(compressed_path): + downloader = Downloader(False, False) + 
downloader.download(base_url, None, compressed_path, compressed_bytes) + if not os.path.exists(uncompressed_path): + decompressor = Decompressor() + decompressor.decompress(compressed_path, uncompressed_path, uncompressed_bytes) + + def partition(self, partition_index, total_partitions): + return self + + def params(self): + params = self._params + with open('model_id.json', 'r') as f: + d = json.loads(f.read()) + params['body']['query']['neural']['passage_embedding']['model_id'] = d['model_id'] + count = self._params.get("variable-queries", 0) + if count > 0: + script_dir = os.path.dirname(os.path.realpath(__file__)) + with open(script_dir + '/queries.json', 'r') as f: + lines = f.read().splitlines() + line =random.choice(lines) + query_text = json.loads(line)['query'] + params['body']['query']['neural']['passage_embedding']['query_text'] = query_text + + return params + +class QueryParamSourceHybridBm25Knn: + def __init__(self, workload, params, **kwargs): + if len(workload.indices) == 1: + index = workload.indices[0].name + if len(workload.indices[0].types) == 1: + type = workload.indices[0].types[0].name + else: + type = None + else: + index = "_all" + type = None + + self._params = params + self._params['index'] = index + self._params['type'] = type + self._params['variable-queries'] = params.get("variable-queries", 0) + self.infinite = True + + if self._params['variable-queries'] > 0: + with open(script_dir + os.sep + 'workload_queries_knn.json', 'r') as f: + d = json.loads(f.read()) + source_file = d['source-file'] + base_url = d['base-url'] + compressed_bytes = d['compressed-bytes'] + uncompressed_bytes = d['uncompressed-bytes'] + compressed_path = script_dir + os.sep + source_file + uncompressed_path = script_dir + os.sep + Path(source_file).stem + if not os.path.exists(compressed_path): + downloader = Downloader(False, False) + downloader.download(base_url, None, compressed_path, compressed_bytes) + if not os.path.exists(uncompressed_path): + decompressor 
= Decompressor() + decompressor.decompress(compressed_path, uncompressed_path, uncompressed_bytes) + + def partition(self, partition_index, total_partitions): + return self + + def params(self): + params = self._params + count = self._params.get("variable-queries", 0) + if count > 0: + script_dir = os.path.dirname(os.path.realpath(__file__)) + with open(script_dir + '/queries.json', 'r') as f: + lines = f.read().splitlines() + line =random.choice(lines) + query_text = json.loads(line)['query'] + match_query = random.choice(query_text.split()).lower() + query_vector = json.loads(line)['vector_embedding'] + params['body']['query']['hybrid']['queries'][0]['match']['title'] = match_query + params['body']['query']['hybrid']['queries'][1]['knn']['passage_embedding']['vector'] = query_vector + return params + +class QueryParamSourceHybridBm25Neural: + def __init__(self, workload, params, **kwargs): + if len(workload.indices) == 1: + index = workload.indices[0].name + if len(workload.indices[0].types) == 1: + type = workload.indices[0].types[0].name + else: + type = None + else: + index = "_all" + type = None + + self._params = params + self._params['index'] = index + self._params['type'] = type + self._params['variable-queries'] = params.get("variable-queries", 0) + self.infinite = True + + if self._params['variable-queries'] > 0: + with open(script_dir + os.sep + 'workload_queries_knn.json', 'r') as f: + d = json.loads(f.read()) + source_file = d['source-file'] + base_url = d['base-url'] + compressed_bytes = d['compressed-bytes'] + uncompressed_bytes = d['uncompressed-bytes'] + compressed_path = script_dir + os.sep + source_file + uncompressed_path = script_dir + os.sep + Path(source_file).stem + if not os.path.exists(compressed_path): + downloader = Downloader(False, False) + downloader.download(base_url, None, compressed_path, compressed_bytes) + if not os.path.exists(uncompressed_path): + decompressor = Decompressor() + decompressor.decompress(compressed_path, 
uncompressed_path, uncompressed_bytes) + + def partition(self, partition_index, total_partitions): + return self + + def params(self): + params = self._params + count = self._params.get("variable-queries", 0) + if count > 0: + script_dir = os.path.dirname(os.path.realpath(__file__)) + model_id = '' + with open('model_id.json', 'r') as f: + d = json.loads(f.read()) + model_id = d['model_id'] + with open(script_dir + '/queries.json', 'r') as f: + lines = f.read().splitlines() + line =random.choice(lines) + query_text = json.loads(line)['query'] + match_query = random.choice(query_text.split()).lower() + params['body']['query']['hybrid']['queries'][0]['match']['title'] = match_query + params['body']['query']['hybrid']['queries'][1]['neural']['passage_embedding']['model_id'] = model_id + params['body']['query']['hybrid']['queries'][1]['neural']['passage_embedding']['query_text'] = query_text + return params + +def register(registry): + registry.register_param_source("semantic-search-neural-source", QueryParamSourceNeural) + registry.register_param_source("hybrid-query-bm25-neural-search-source", QueryParamSourceHybridBm25Neural) + registry.register_param_source("hybrid-query-bm25-knn-search-source", QueryParamSourceHybridBm25Knn) + registry.register_param_source("create-ingest-pipeline", ingest_pipeline_param_source) \ No newline at end of file diff --git a/trec_covid_semantic_search/workload_queries_knn.json b/trec_covid_semantic_search/workload_queries_knn.json new file mode 100644 index 00000000..b54253c9 --- /dev/null +++ b/trec_covid_semantic_search/workload_queries_knn.json @@ -0,0 +1,6 @@ +{ + "base-url": "https://github.com/martin-gaievski/neural-search/releases/download/trec_covid_queries_knn", + "source-file": "queries.json.zip", + "compressed-bytes" : 193612, + "uncompressed-bytes": 519763 +} From bad64e29e2fcf2370e8778a385d9cc5e64b30f0e Mon Sep 17 00:00:00 2001 From: Martin Gaievski Date: Fri, 19 Jul 2024 16:36:37 +0000 Subject: [PATCH 2/4] Refactor code, 
add param files Signed-off-by: Martin Gaievski --- .../params/params_no_ml_node.json | 13 ++++ .../{params.json => params_only_ml_node.json} | 7 +- .../common/redeploy_local_model.json | 18 +++++ .../common/semantic-search-queries.json | 24 +++++++ .../test_procedures/procedures.json | 71 +++---------------- 5 files changed, 68 insertions(+), 65 deletions(-) create mode 100644 trec_covid_semantic_search/params/params_no_ml_node.json rename trec_covid_semantic_search/params/{params.json => params_only_ml_node.json} (65%) create mode 100644 trec_covid_semantic_search/test_procedures/common/redeploy_local_model.json create mode 100644 trec_covid_semantic_search/test_procedures/common/semantic-search-queries.json diff --git a/trec_covid_semantic_search/params/params_no_ml_node.json b/trec_covid_semantic_search/params/params_no_ml_node.json new file mode 100644 index 00000000..2f81a979 --- /dev/null +++ b/trec_covid_semantic_search/params/params_no_ml_node.json @@ -0,0 +1,13 @@ +{ + "bulk_indexing_clients": 4, + "bulk_size": 200, + "number_of_replicas": 1, + "number_of_shards" :8, + "ingest_percentage":100, + "search_clients": 8, + "warmup_iterations": 20, + "iterations": 100, + "variable_queries": 50, + "k": 100, + "only_run_on_ml_node" : "false" +} diff --git a/trec_covid_semantic_search/params/params.json b/trec_covid_semantic_search/params/params_only_ml_node.json similarity index 65% rename from trec_covid_semantic_search/params/params.json rename to trec_covid_semantic_search/params/params_only_ml_node.json index 310565b0..becb0209 100644 --- a/trec_covid_semantic_search/params/params.json +++ b/trec_covid_semantic_search/params/params_only_ml_node.json @@ -1,6 +1,6 @@ { - "bulk_indexing_clients": 2, - "bulk_size": 100, + "bulk_indexing_clients": 4, + "bulk_size": 200, "number_of_replicas": 1, "number_of_shards" :8, "ingest_percentage":100, @@ -8,5 +8,6 @@ "warmup_iterations": 20, "iterations": 100, "variable_queries": 50, - "k": 100 + "k": 100, + 
"only_run_on_ml_node" : "true" } diff --git a/trec_covid_semantic_search/test_procedures/common/redeploy_local_model.json b/trec_covid_semantic_search/test_procedures/common/redeploy_local_model.json new file mode 100644 index 00000000..960318b8 --- /dev/null +++ b/trec_covid_semantic_search/test_procedures/common/redeploy_local_model.json @@ -0,0 +1,18 @@ +{ + "operation": { + "operation-type": "delete-ml-model", + "model-name": "{{ model_name | default('huggingface/sentence-transformers/all-mpnet-base-v2')}}" + } + }, + { + "operation": { + "operation-type": "register-ml-model", + "model-name": "{{ model_name | default('huggingface/sentence-transformers/all-mpnet-base-v2')}}", + "model-version": "{{ model_version | default('1.0.1') }}", + "model-format": "{{ model_format | default('TORCH_SCRIPT') }}", + "model-config-file": "{{ model_config_file | default('') }}" + } + }, + { + "operation": "deploy-ml-model" + } \ No newline at end of file diff --git a/trec_covid_semantic_search/test_procedures/common/semantic-search-queries.json b/trec_covid_semantic_search/test_procedures/common/semantic-search-queries.json new file mode 100644 index 00000000..1df4f4ba --- /dev/null +++ b/trec_covid_semantic_search/test_procedures/common/semantic-search-queries.json @@ -0,0 +1,24 @@ +{ + "operation": "semantic-search-neural", + "warmup-iterations": {{warmup_iterations | default(50) | tojson}}, + "iterations": {{iterations | default(100) | tojson }}, + "clients": {{ search_clients | default(1)}} + }, + { + "operation": "semantic-search-hybrid-bm25-and-knn-search", + "warmup-iterations": {{warmup_iterations | default(50) | tojson}}, + "iterations": {{iterations | default(100) | tojson }}, + "clients": {{ search_clients | default(1)}} + }, + { + "operation": "semantic-search-hybrid-bm25-and-neural-search", + "warmup-iterations": {{warmup_iterations | default(50) | tojson}}, + "iterations": {{iterations | default(100) | tojson }}, + "clients": {{ search_clients | default(1)}} + }, 
+ { + "operation": "semantic-search-hybrid-bm25-range-and-neural-search", + "warmup-iterations": {{warmup_iterations | default(50) | tojson}}, + "iterations": {{iterations | default(100) | tojson }}, + "clients": {{ search_clients | default(1)}} + } \ No newline at end of file diff --git a/trec_covid_semantic_search/test_procedures/procedures.json b/trec_covid_semantic_search/test_procedures/procedures.json index 87b6a0c9..1a9a58bf 100644 --- a/trec_covid_semantic_search/test_procedures/procedures.json +++ b/trec_covid_semantic_search/test_procedures/procedures.json @@ -11,7 +11,7 @@ "persistent": { "plugins": { "ml_commons": { - "only_run_on_ml_node": "false", + "only_run_on_ml_node": "{{only_run_on_ml_node | default('false')}}", "native_memory_threshold": "100", "allow_registering_model_via_local_file": "true", "allow_registering_model_via_url": "true" @@ -27,24 +27,7 @@ { "operation": "delete-ingest-pipeline" }, - { - "operation": { - "operation-type": "delete-ml-model", - "model-name": "{{ model_name | default('huggingface/sentence-transformers/all-mpnet-base-v2')}}" - } - }, - { - "operation": { - "operation-type": "register-ml-model", - "model-name": "{{ model_name | default('huggingface/sentence-transformers/all-mpnet-base-v2')}}", - "model-version": "{{ model_version | default('1.0.1') }}", - "model-format": "{{ model_format | default('TORCH_SCRIPT') }}", - "model-config-file": "{{ model_config_file | default('') }}" - } - }, - { - "operation": "deploy-ml-model" - }, + {{ benchmark.collect(parts="common/redeploy_local_model.json") }}, { "operation": "create-ingest-pipeline" }, @@ -111,7 +94,11 @@ "iterations": {{iterations | default(500) | tojson }}, "target-throughput": {{ target_throughput | default(100) | tojson}}, "clients": {{ search_clients | default(1) }} - } + }, + { + "operation": "create-normalization-processor-no-weights-search-pipeline" + }, + {{ benchmark.collect(parts="common/semantic-search-queries.json") }} ] }, { @@ -119,50 +106,10 @@ 
"description": "Run semantic search work.", "default": false, "schedule": [ - { - "operation": { - "operation-type": "delete-ml-model", - "model-name": "{{ model_name | default('huggingface/sentence-transformers/all-mpnet-base-v2')}}" - } - }, - { - "operation": { - "operation-type": "register-ml-model", - "model-name": "{{ model_name | default('huggingface/sentence-transformers/all-mpnet-base-v2')}}", - "model-version": "{{ model_version | default('1.0.1') }}", - "model-format": "{{ model_format | default('TORCH_SCRIPT') }}", - "model-config-file": "{{ model_config_file | default('') }}" - } - }, - { - "operation": "deploy-ml-model" - }, + {{ benchmark.collect(parts="common/redeploy_local_model.json") }}, { "operation": "create-normalization-processor-no-weights-search-pipeline" }, - { - "operation": "semantic-search-neural", - "warmup-iterations": {{warmup_iterations | default(50) | tojson}}, - "iterations": {{iterations | default(100) | tojson }}, - "clients": {{ search_clients | default(1)}} - }, - { - "operation": "semantic-search-hybrid-bm25-and-knn-search", - "warmup-iterations": {{warmup_iterations | default(50) | tojson}}, - "iterations": {{iterations | default(100) | tojson }}, - "clients": {{ search_clients | default(1)}} - }, - { - "operation": "semantic-search-hybrid-bm25-and-neural-search", - "warmup-iterations": {{warmup_iterations | default(50) | tojson}}, - "iterations": {{iterations | default(100) | tojson }}, - "clients": {{ search_clients | default(1)}} - }, - { - "operation": "semantic-search-hybrid-bm25-range-and-neural-search", - "warmup-iterations": {{warmup_iterations | default(50) | tojson}}, - "iterations": {{iterations | default(100) | tojson }}, - "clients": {{ search_clients | default(1)}} - } + {{ benchmark.collect(parts="common/semantic-search-queries.json") }} ] } \ No newline at end of file From 079b4b4bb55f67ce5895a686c729a98b3c47908d Mon Sep 17 00:00:00 2001 From: Martin Gaievski Date: Tue, 30 Jul 2024 18:50:55 +0000 Subject: 
[PATCH 3/4] Added custom runner for setting concurrent segment search params Signed-off-by: Martin Gaievski --- trec_covid_semantic_search/README.md | 2 + ...rams_no_ml_node_concurrent_seg_search.json | 14 +++++++ ...ms_only_ml_node_concurrent_seg_search.json | 14 +++++++ trec_covid_semantic_search/runners.py | 38 +++++++++++++++++++ .../test_procedures/procedures.json | 10 ++++- trec_covid_semantic_search/workload.py | 6 ++- 6 files changed, 82 insertions(+), 2 deletions(-) create mode 100644 trec_covid_semantic_search/params/params_no_ml_node_concurrent_seg_search.json create mode 100644 trec_covid_semantic_search/params/params_only_ml_node_concurrent_seg_search.json create mode 100644 trec_covid_semantic_search/runners.py diff --git a/trec_covid_semantic_search/README.md b/trec_covid_semantic_search/README.md index 6374cdf6..6567d4fa 100644 --- a/trec_covid_semantic_search/README.md +++ b/trec_covid_semantic_search/README.md @@ -71,6 +71,8 @@ This workload allows the following parameters to be specified using `--workload- * `num_variable_queries` (default: 0) Number of variable queries will be used for the semantic search task, 0 means fixed query and max value is 50. * `range_gte` (default: 100) Number that defines the lower bound (inclusive) for range query when it's used as elemnts in semantic search query * `range_lte` (default: 10000000) Number that defines the upper bound (inclusive) for range query when it's used as elemnts in semantic search query +* `concurent_segment_search_enabled` (default: `false`) Enables or disables concurrent segment search feature +* `max_slice_count` (default: 0) Set the maximum number of slices for concurrent segment search feature. 
0 means we use Lucene mechanism of calculating the number of slices

### Running a benchmark

diff --git a/trec_covid_semantic_search/params/params_no_ml_node_concurrent_seg_search.json b/trec_covid_semantic_search/params/params_no_ml_node_concurrent_seg_search.json
new file mode 100644
index 00000000..bc47d9d3
--- /dev/null
+++ b/trec_covid_semantic_search/params/params_no_ml_node_concurrent_seg_search.json
@@ -0,0 +1,14 @@
+{
+  "bulk_indexing_clients": 4,
+  "bulk_size": 200,
+  "number_of_replicas": 1,
+  "number_of_shards" :8,
+  "ingest_percentage":100,
+  "search_clients": 8,
+  "warmup_iterations": 20,
+  "iterations": 100,
+  "variable_queries": 50,
+  "k": 100,
+  "only_run_on_ml_node" : "false",
+  "concurent_segment_search_enabled": "true"
+}
diff --git a/trec_covid_semantic_search/params/params_only_ml_node_concurrent_seg_search.json b/trec_covid_semantic_search/params/params_only_ml_node_concurrent_seg_search.json
new file mode 100644
index 00000000..5ea45814
--- /dev/null
+++ b/trec_covid_semantic_search/params/params_only_ml_node_concurrent_seg_search.json
@@ -0,0 +1,14 @@
+{
+  "bulk_indexing_clients": 4,
+  "bulk_size": 200,
+  "number_of_replicas": 1,
+  "number_of_shards" :8,
+  "ingest_percentage":100,
+  "search_clients": 8,
+  "warmup_iterations": 20,
+  "iterations": 100,
+  "variable_queries": 50,
+  "k": 100,
+  "only_run_on_ml_node" : "true",
+  "concurent_segment_search_enabled": "true"
+}
diff --git a/trec_covid_semantic_search/runners.py b/trec_covid_semantic_search/runners.py
new file mode 100644
index 00000000..37ce4376
--- /dev/null
+++ b/trec_covid_semantic_search/runners.py
@@ -0,0 +1,38 @@
+# SPDX-License-Identifier: Apache-2.0
+#
+# The OpenSearch Contributors require contributions made to
+# this file be licensed under the Apache-2.0 license or a
+# compatible open source license.
+
+from osbenchmark.worker_coordinator.runner import Retry, Runner
+from osbenchmark.client import RequestContextHolder
+
+# This runner class and registration is a temporary workaround while the next version of OSB is pending release
+def register(registry):
+    registry.register_runner(
+        UpdateConcurrentSegmentSearchSettings.RUNNER_NAME,
+        Retry(UpdateConcurrentSegmentSearchSettings()), async_runner=True
+    )
+
+request_context_holder = RequestContextHolder()
+
+class UpdateConcurrentSegmentSearchSettings(Runner):
+
+    RUNNER_NAME = "update-concurrent-segment-search-settings"
+
+    async def __call__(self, opensearch, params):
+        enable_setting = params.get("enable", "false")
+        max_slice_count = params.get("max_slice_count", None)
+        body = {
+            "persistent": {
+                "search.concurrent_segment_search.enabled": enable_setting
+            }
+        }
+        if max_slice_count is not None:
+            body["persistent"]["search.concurrent.max_slice_count"] = max_slice_count
+        request_context_holder.on_client_request_start()
+        await opensearch.cluster.put_settings(body=body)
+        request_context_holder.on_client_request_end()
+
+    def __repr__(self, *args, **kwargs):
+        return self.RUNNER_NAME
diff --git a/trec_covid_semantic_search/test_procedures/procedures.json b/trec_covid_semantic_search/test_procedures/procedures.json
index 1a9a58bf..9ca2ddc7 100644
--- a/trec_covid_semantic_search/test_procedures/procedures.json
+++ b/trec_covid_semantic_search/test_procedures/procedures.json
@@ -22,7 +22,15 @@
        }
      },
      {
-        "operation": "delete-index"
+        "operation": "delete-index"
+      },
+      {
+        "name": "set-concurrent-segment-search",
+        "operation": {
+          "operation-type": "update-concurrent-segment-search-settings",
+          "enable": "{{concurent_segment_search_enabled | default('false')}}",
+          "max_slice_count": "{{max_slice_count | default(0)}}"
+        }
      },
      {
        "operation": "delete-ingest-pipeline"
diff --git a/trec_covid_semantic_search/workload.py b/trec_covid_semantic_search/workload.py
index 073b8344..20408125 100644
--- 
a/trec_covid_semantic_search/workload.py +++ b/trec_covid_semantic_search/workload.py @@ -6,6 +6,8 @@ from osbenchmark.workload.loader import Downloader from osbenchmark.workload.loader import Decompressor from osbenchmark.workload.loader import Decompressor +from osbenchmark.worker_coordinator.runner import Retry +from .runners import register as register_runners script_dir = os.path.dirname(os.path.realpath(__file__)) @@ -183,4 +185,6 @@ def register(registry): registry.register_param_source("semantic-search-neural-source", QueryParamSourceNeural) registry.register_param_source("hybrid-query-bm25-neural-search-source", QueryParamSourceHybridBm25Neural) registry.register_param_source("hybrid-query-bm25-knn-search-source", QueryParamSourceHybridBm25Knn) - registry.register_param_source("create-ingest-pipeline", ingest_pipeline_param_source) \ No newline at end of file + # This runner class and registration is a temporary workaround while the next version of OSB is pending release + registry.register_param_source("create-ingest-pipeline", ingest_pipeline_param_source) + register_runners(registry) \ No newline at end of file From 79bc1f31314eab16486a70cb964c4db486a5b669 Mon Sep 17 00:00:00 2001 From: Martin Gaievski Date: Mon, 9 Sep 2024 23:02:34 +0000 Subject: [PATCH 4/4] Working version Signed-off-by: Martin Gaievski --- treccovid_semantic_search/index-1M.json | 45 ++++++ .../operations/default.json | 78 +++++++++++ treccovid_semantic_search/params.json | 13 ++ .../test_procedures/default.json | 130 +++++++++++++++++- treccovid_semantic_search/workload.json | 28 +++- treccovid_semantic_search/workload.py | 111 +++++++++++++++ 6 files changed, 402 insertions(+), 3 deletions(-) create mode 100644 treccovid_semantic_search/index-1M.json create mode 100644 treccovid_semantic_search/params.json diff --git a/treccovid_semantic_search/index-1M.json b/treccovid_semantic_search/index-1M.json new file mode 100644 index 00000000..e1489ebb --- /dev/null +++ 
b/treccovid_semantic_search/index-1M.json @@ -0,0 +1,45 @@ +{ + "settings": { + "index.number_of_shards": {{number_of_shards | default(1)}}, + "index.number_of_replicas": {{number_of_replicas | default(0)}}, + "index.queries.cache.enabled": {{query_cache_enabled | default(false) | tojson}}, + "index.requests.cache.enable": {{requests_cache_enabled | default(false) | tojson}}, + "index.merge.policy.max_merged_segment": "100GB", + "index.knn": true, + "default_pipeline": "nlp-ingest-pipeline" + }, + "mappings": { + "dynamic": "true", + "_source": { + "enabled": {{ source_enabled | default(true) | tojson }} + }, + "properties": { + "title": { + "type": "text" + }, + "text": { + "type": "text" + }, + "metadata": { + "type": "nested", + "properties": { + "url": { + "type": "text" + }, + "pubmed_id": { + "type": "integer" + } + } + }, + "passage_embedding": { + "type": "knn_vector", + "dimension": 768, + "method": { + "name": "hnsw", + "space_type": "innerproduct", + "engine": "faiss" + } + } + } + } +} diff --git a/treccovid_semantic_search/operations/default.json b/treccovid_semantic_search/operations/default.json index ac3a65e7..0dbf85cf 100644 --- a/treccovid_semantic_search/operations/default.json +++ b/treccovid_semantic_search/operations/default.json @@ -48,6 +48,26 @@ } } }, + { + "name": "create-normalization-processor-search-pipeline", + "operation-type": "create-search-pipeline", + "id": "nlp-normalization-search-pipeline", + "body": { + "description": "Post processor for hybrid search with min_max normalization and arithmetic_mean combination", + "phase_results_processors": [ + { + "normalization-processor": { + "normalization": { + "technique": "min_max" + }, + "combination": { + "technique": "arithmetic_mean" + } + } + } + ] + } + }, { "name": "semantic-search", "operation-type": "search", @@ -69,4 +89,62 @@ } } } + }, + { + "name": "semantic-search-neural", + "operation-type": "search", + "variable-queries": {{variable_queries | default(0)}}, + 
"param-source": "semantic-search-neural-source", + "body": { + "_source": { + "excludes": [ + "passage_embedding" + ] + }, + "query": { + "neural": { + "passage_embedding": { + "query_text": "what types of rapid testing for Covid-19 have been developed?", + "model_id": "", + "k": {{k | default(10)}} + } + } + } + } + }, + { + "name": "hybrid-search-bm25-neural", + "operation-type": "search", + "request-params": { + "search_pipeline": "nlp-normalization-search-pipeline" + }, + "variable-queries": {{variable_queries | default(0)}}, + "param-source": "hybrid-query-bm25-neural-search-source", + "body": { + "_source": { + "excludes": [ + "passage_embedding" + ] + }, + "query": { + "hybrid": { + "queries": [ + { + "match": { + "title": "" + } + }, + { + "neural": { + "passage_embedding": { + "query_text": "what types of rapid testing for Covid-19 have been developed?", + "model_id": "", + "k": {{k | default(10)}} + } + } + } + ] + } + } + } } diff --git a/treccovid_semantic_search/params.json b/treccovid_semantic_search/params.json new file mode 100644 index 00000000..7f17ce4f --- /dev/null +++ b/treccovid_semantic_search/params.json @@ -0,0 +1,13 @@ +{ + "bulk_indexing_clients": 4, + "bulk_size": 200, + "number_of_replicas": 1, + "number_of_shards" :8, + "ingest_percentage":2, + "search_clients": 8, + "warmup_iterations": 20, + "iterations": 100, + "k": 100, + "corpus_size": "1M", + "variable_queries": 10 +} diff --git a/treccovid_semantic_search/test_procedures/default.json b/treccovid_semantic_search/test_procedures/default.json index 12c1f675..d80fb0d7 100644 --- a/treccovid_semantic_search/test_procedures/default.json +++ b/treccovid_semantic_search/test_procedures/default.json @@ -12,7 +12,6 @@ "plugins": { "ml_commons": { "only_run_on_ml_node": "false", - "native_memory_threshold": "99", "allow_registering_model_via_local_file": "true", "allow_registering_model_via_url": "true" } @@ -109,4 +108,133 @@ "clients": {{ search_clients | default(1)}} } ] + }, + { + 
"name": "index-merge-search-vector", + "description": "Indexes the corpus with vector embedding and then runs queries with vector embedding.", + "default": false, + "schedule": [ + { + "name": "cluster-settings", + "operation": { + "operation-type": "put-settings", + "body": { + "persistent": { + "plugins": { + "ml_commons": { + "only_run_on_ml_node": "false", + "native_memory_threshold": "99", + "allow_registering_model_via_local_file": "true", + "allow_registering_model_via_url": "true" + } + } + } + } + } + }, + { + "operation": "delete-index" + }, + { + "operation": "delete-ingest-pipeline" + }, + { + "operation": { + "operation-type": "delete-ml-model", + "model-name": "{{ model_name | default('huggingface/sentence-transformers/all-mpnet-base-v2')}}" + } + }, + { + "operation": { + "operation-type": "register-ml-model", + "model-name": "{{ model_name | default('huggingface/sentence-transformers/all-mpnet-base-v2')}}", + "model-version": "{{ model_version | default('1.0.1') }}", + "model-format": "{{ model_format | default('TORCH_SCRIPT') }}", + "model-config-file": "{{ model_config_file | default('') }}" + } + }, + { + "operation": "deploy-ml-model" + }, + { + "operation": "create-ingest-pipeline" + }, + { + "operation": { + "operation-type": "create-index", + "settings": {%- if index_settings is defined %} {{index_settings | tojson}} {%- else %} { + "index.refresh_interval": "5s", + "index.translog.flush_threshold_size": "1g" + }{%- endif %} + } + }, + { + "name": "check-cluster-health", + "operation": { + "operation-type": "cluster-health", + "index": "treccovid1m", + "request-params": { + "wait_for_status": "{{cluster_health | default('green')}}", + "wait_for_no_relocating_shards": "true" + }, + "retry-until-success": true + } + }, + { + "operation": "index-append", + "warmup-time-period": 60, + "clients": {{bulk_indexing_clients | default(1)}}, + "ignore-response-error-level": "{{error_level | default('non-fatal')}}" + }, + { + "name": 
"refresh-after-index", + "operation": "refresh" + }, + { + "operation": { + "operation-type": "force-merge", + "request-timeout": 7200{%- if force_merge_max_num_segments is defined %}, + "max-num-segments": {{ force_merge_max_num_segments | tojson }} + {%- endif %} + } + }, + { + "name": "refresh-after-force-merge", + "operation": "refresh" + }, + { + "operation": "wait-until-merges-finish" + } + ] + }, + { + "name": "run-search-semantic", + "description": "Runs search workload for experiment 3: neural query search", + "default": false, + "schedule": [ + { + "name": "check-cluster-health-before-index-creation", + "operation": { + "operation-type": "cluster-health", + "index": "treccovid1m", + "request-params": { + "wait_for_status": "{{cluster_health | default('green')}}", + "wait_for_no_relocating_shards": "true" + }, + "retry-until-success": true + } + }, + { + "operation": "semantic-search-neural", + "warmup-iterations": {{warmup_iterations | default(50) | tojson}}, + "iterations": {{iterations | default(100) | tojson }}, + "clients": {{ search_clients | default(1)}} + }, + { + "operation": "hybrid-search-bm25-neural", + "warmup-iterations": {{warmup_iterations | default(50) | tojson}}, + "iterations": {{iterations | default(100) | tojson }}, + "clients": {{ search_clients | default(1)}} + } + ] } diff --git a/treccovid_semantic_search/workload.json b/treccovid_semantic_search/workload.json index 761d1d0e..92cedd57 100644 --- a/treccovid_semantic_search/workload.json +++ b/treccovid_semantic_search/workload.json @@ -1,15 +1,25 @@ {% import "benchmark.helpers" as benchmark with context %} - { "version": 2, "description": "Trec-Covid is a dataset collection of documents about COVID-19 information.", "indices": [ + {% if not corpus_size %} + {% set corpus_size = '100' %} + {% endif %} + {% if corpus_size == '100' %} { "name": "treccovid", "body": "index.json" } + {% elif corpus_size == '1M' %} + { + "name": "treccovid1m", + "body": "index-1M.json" + } + {% endif %} 
], "corpora": [ + {% if corpus_size == '100' %} { "name": "treccovid", "base-url": "https://dbyiw3u3rf9yr.cloudfront.net/corpora/treccovid", @@ -18,10 +28,24 @@ "source-file": "documents.json.bz2", "document-count": 129192, "compressed-bytes": 51187469, - "uncompressed-bytes": 211980208 + "uncompressed-bytes": 211980208, + "target-index": "treccovid" + } + ] + } + {% elif corpus_size == '1M' %} + { + "name": "treccovid1m", + "base-url": "https://github.com/martin-gaievski/neural-search/releases/download/trec_covid_dataset_1m_v2", + "documents": [ + { + "source-file": "documents.json.zip", + "document-count": 1027986, + "target-index": "treccovid1m" } ] } + {% endif %} ], "operations": [ {{ benchmark.collect(parts="operations/*.json") }} diff --git a/treccovid_semantic_search/workload.py b/treccovid_semantic_search/workload.py index 1eaa0436..50712aea 100644 --- a/treccovid_semantic_search/workload.py +++ b/treccovid_semantic_search/workload.py @@ -70,6 +70,117 @@ def params(self): params['body']['query']['neural']['passage_embedding']['query_text'] = query_text return params +class QueryParamSourceNeural: + def __init__(self, workload, params, **kwargs): + if len(workload.indices) == 1: + index = workload.indices[0].name + if len(workload.indices[0].types) == 1: + type = workload.indices[0].types[0].name + else: + type = None + else: + index = "_all" + type = None + + self._params = params + self._params['index'] = index + self._params['type'] = type + self._params['variable-queries'] = params.get("variable-queries", 0) + self.infinite = True + + if self._params['variable-queries'] > 0: + with open(script_dir + os.sep + 'workload_queries.json', 'r') as f: + d = json.loads(f.read()) + source_file = d['source-file'] + base_url = d['base-url'] + compressed_bytes = d['compressed-bytes'] + uncompressed_bytes = d['uncompressed-bytes'] + compressed_path = script_dir + os.sep + source_file + uncompressed_path = script_dir + os.sep + Path(source_file).stem + if not 
os.path.exists(compressed_path): + downloader = Downloader(False, False) + downloader.download(base_url, None, compressed_path, compressed_bytes) + if not os.path.exists(uncompressed_path): + decompressor = Decompressor() + decompressor.decompress(compressed_path, uncompressed_path, uncompressed_bytes) + + def partition(self, partition_index, total_partitions): + return self + + def params(self): + params = self._params + with open('model_id.json', 'r') as f: + d = json.loads(f.read()) + params['body']['query']['neural']['passage_embedding']['model_id'] = d['model_id'] + count = self._params.get("variable-queries", 0) + if count > 0: + script_dir = os.path.dirname(os.path.realpath(__file__)) + with open(script_dir + '/queries.json', 'r') as f: + lines = f.read().splitlines() + line =random.choice(lines) + query_text = json.loads(line)['text'] + params['body']['query']['neural']['passage_embedding']['query_text'] = query_text + + return params + +class QueryParamSourceHybridBm25Neural: + def __init__(self, workload, params, **kwargs): + if len(workload.indices) == 1: + index = workload.indices[0].name + if len(workload.indices[0].types) == 1: + type = workload.indices[0].types[0].name + else: + type = None + else: + index = "_all" + type = None + + self._params = params + self._params['index'] = index + self._params['type'] = type + self._params['variable-queries'] = params.get("variable-queries", 0) + self.infinite = True + + if self._params['variable-queries'] > 0: + with open(script_dir + os.sep + 'workload_queries.json', 'r') as f: + d = json.loads(f.read()) + source_file = d['source-file'] + base_url = d['base-url'] + compressed_bytes = d['compressed-bytes'] + uncompressed_bytes = d['uncompressed-bytes'] + compressed_path = script_dir + os.sep + source_file + uncompressed_path = script_dir + os.sep + Path(source_file).stem + if not os.path.exists(compressed_path): + downloader = Downloader(False, False) + downloader.download(base_url, None, compressed_path, 
compressed_bytes) + if not os.path.exists(uncompressed_path): + decompressor = Decompressor() + decompressor.decompress(compressed_path, uncompressed_path, uncompressed_bytes) + + def partition(self, partition_index, total_partitions): + return self + + def params(self): + params = self._params + count = self._params.get("variable-queries", 0) + if count > 0: + script_dir = os.path.dirname(os.path.realpath(__file__)) + model_id = '' + with open('model_id.json', 'r') as f: + d = json.loads(f.read()) + model_id = d['model_id'] + with open(script_dir + '/queries.json', 'r') as f: + lines = f.read().splitlines() + line =random.choice(lines) + query_text = json.loads(line)['text'] + match_query = random.choice(query_text.split()).lower() + params['body']['query']['hybrid']['queries'][0]['match']['title'] = match_query + params['body']['query']['hybrid']['queries'][1]['neural']['passage_embedding']['model_id'] = model_id + params['body']['query']['hybrid']['queries'][1]['neural']['passage_embedding']['query_text'] = query_text + return params + def register(registry): registry.register_param_source("semantic-search-source", QueryParamSource) + registry.register_param_source("semantic-search-neural-source", QueryParamSourceNeural) + registry.register_param_source("hybrid-query-bm25-neural-search-source", QueryParamSourceHybridBm25Neural) registry.register_param_source("create-ingest-pipeline", ingest_pipeline_param_source)