-
Notifications
You must be signed in to change notification settings - Fork 75
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Adding semantic search workload that includes vector and bm25 search #342
base: main
Are you sure you want to change the base?
Changes from all commits
203fb92
bad64e2
079b4b4
79bc1f3
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
{ | ||
"settings": { | ||
"index.number_of_shards": {{number_of_shards | default(1)}}, | ||
"index.number_of_replicas": {{number_of_replicas | default(0)}}, | ||
"index.queries.cache.enabled": {{query_cache_enabled | default(false) | tojson}}, | ||
"index.requests.cache.enable": {{requests_cache_enabled | default(false) | tojson}}, | ||
"index.merge.policy.max_merged_segment": "100GB", | ||
"index.knn": true, | ||
"default_pipeline": "nlp-ingest-pipeline" | ||
}, | ||
"mappings": { | ||
"dynamic": "true", | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Any reason to allow dynamic mapping? Why not strict? |
||
"_source": { | ||
"enabled": {{ source_enabled | default(true) | tojson }} | ||
}, | ||
"properties": { | ||
"title": { | ||
"type": "text" | ||
}, | ||
"metadata": { | ||
"type": "nested", | ||
"properties": { | ||
"url": { | ||
"type": "text" | ||
}, | ||
"pubmed_id": { | ||
"type": "integer" | ||
} | ||
} | ||
}, | ||
"passage_embedding": { | ||
"type": "knn_vector", | ||
"dimension": 768, | ||
"method": { | ||
"name": "hnsw", | ||
"space_type": "innerproduct", | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This ("space_type") should be outside "method". |
||
"engine": "faiss", | ||
"parameters": { | ||
"ef_construction": 256, | ||
"m": 256 | ||
} | ||
} | ||
} | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,207 @@ | ||
{ | ||
"name": "index", | ||
"operation-type": "bulk", | ||
"bulk-size": {{bulk_size | default(100)}}, | ||
"ingest-percentage": {{ingest_percentage | default(100)}} | ||
}, | ||
{ | ||
"name": "delete-ingest-pipeline", | ||
"operation-type": "delete-pipeline", | ||
"id": "nlp-ingest-pipeline" | ||
}, | ||
{ | ||
"name": "create-ingest-pipeline", | ||
"operation-type": "put-pipeline", | ||
"param-source": "create-ingest-pipeline", | ||
"id": "nlp-ingest-pipeline", | ||
"body": { | ||
"description": "An NLP ingest pipeline", | ||
"processors": [ | ||
{ | ||
"text_embedding": { | ||
"model_id": "", | ||
"field_map": { | ||
"title": "passage_embedding" | ||
} | ||
} | ||
} | ||
] | ||
} | ||
}, | ||
{ | ||
"name": "index-append", | ||
"operation-type": "bulk", | ||
"bulk-size": {{bulk_size | default(100)}}, | ||
"ingest-percentage": {{ingest_percentage | default(100)}} | ||
}, | ||
{ | ||
"name": "default", | ||
"operation-type": "search", | ||
"body": { | ||
"query": { | ||
"match_all": {} | ||
} | ||
} | ||
}, | ||
{ | ||
"name": "semantic-search-neural", | ||
"operation-type": "search", | ||
"variable-queries": {{variable_queries | default(0)}}, | ||
"param-source": "semantic-search-neural-source", | ||
"body": { | ||
"_source": { | ||
"excludes": [ | ||
"passage_embedding" | ||
] | ||
}, | ||
"query": { | ||
"neural": { | ||
"passage_embedding": { | ||
"query_text": "what types of rapid testing for Covid-19 have been developed?", | ||
"model_id": "", | ||
"k": {{k | default(10)}} | ||
} | ||
} | ||
} | ||
} | ||
}, | ||
{ | ||
"name": "create-normalization-processor-no-weights-search-pipeline", | ||
"operation-type": "create-search-pipeline", | ||
"id": "nlp-min-max-arithmetic-search-pipeline", | ||
"body": { | ||
"description": "Post processor for hybrid search with min_max normalization and arithmetic_mean combination", | ||
"phase_results_processors": [ | ||
{ | ||
"normalization-processor": { | ||
"normalization": { | ||
"technique": "min_max" | ||
}, | ||
"combination": { | ||
"technique": "arithmetic_mean" | ||
} | ||
} | ||
} | ||
] | ||
} | ||
}, | ||
{ | ||
"name": "semantic-search-hybrid-bm25-and-neural-search", | ||
"operation-type": "search", | ||
"request-params": { | ||
"search_pipeline": "nlp-min-max-arithmetic-search-pipeline" | ||
}, | ||
"variable-queries": {{variable_queries | default(0)}}, | ||
"param-source": "hybrid-query-bm25-neural-search-source", | ||
"body": { | ||
"_source": { | ||
"excludes": [ | ||
"passage_embedding" | ||
] | ||
}, | ||
"query": { | ||
"hybrid": { | ||
"queries": [ | ||
{ | ||
"match": { | ||
"title": "" | ||
} | ||
}, | ||
{ | ||
"neural": { | ||
"passage_embedding": { | ||
"query_text": "what types of rapid testing for Covid-19 have been developed?", | ||
"model_id": "", | ||
"k": {{k | default(10)}} | ||
} | ||
} | ||
} | ||
] | ||
} | ||
} | ||
} | ||
}, | ||
{ | ||
"name": "semantic-search-hybrid-bm25-and-knn-search", | ||
"operation-type": "search", | ||
"request-params": { | ||
"search_pipeline": "nlp-min-max-arithmetic-search-pipeline" | ||
}, | ||
"variable-queries": {{variable_queries | default(0)}}, | ||
"param-source": "hybrid-query-bm25-knn-search-source", | ||
"body": { | ||
"_source": { | ||
"excludes": [ | ||
"passage_embedding" | ||
] | ||
}, | ||
"query": { | ||
"hybrid": { | ||
"queries": [ | ||
{ | ||
"match": { | ||
"title": "" | ||
} | ||
}, | ||
{ | ||
"knn": { | ||
"passage_embedding": { | ||
"vector": "[1, 2, 3]", | ||
"k": {{k | default(100)}} | ||
} | ||
} | ||
} | ||
] | ||
} | ||
} | ||
} | ||
}, | ||
{ | ||
"name": "semantic-search-hybrid-bm25-range-and-neural-search", | ||
"operation-type": "search", | ||
"request-params": { | ||
"search_pipeline": "nlp-min-max-arithmetic-search-pipeline" | ||
}, | ||
"variable-queries": {{variable_queries | default(0)}}, | ||
"param-source": "hybrid-query-bm25-neural-search-source", | ||
"body": { | ||
"_source": { | ||
"excludes": [ | ||
"passage_embedding" | ||
] | ||
}, | ||
"query": { | ||
"hybrid": { | ||
"queries": [ | ||
{ | ||
"match": { | ||
"title": "" | ||
} | ||
}, | ||
{ | ||
"neural": { | ||
"passage_embedding": { | ||
"query_text": "what types of rapid testing for Covid-19 have been developed?", | ||
"model_id": "", | ||
"k": {{k | default(10)}} | ||
} | ||
} | ||
}, | ||
{ | ||
"nested": { | ||
"path": "metadata", | ||
"query": { | ||
"range": { | ||
"metadata.pubmed_id": { | ||
"gte": {{range_gte | default(100)}}, | ||
"lte": {{range_lte | default(10000000)}} | ||
} | ||
} | ||
} | ||
} | ||
} | ||
] | ||
} | ||
} | ||
} | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
{ | ||
"bulk_indexing_clients": 4, | ||
"bulk_size": 200, | ||
"number_of_replicas": 1, | ||
"number_of_shards" :8, | ||
"ingest_percentage":100, | ||
"search_clients": 8, | ||
"warmup_iterations": 20, | ||
"iterations": 100, | ||
"variable_queries": 50, | ||
"k": 100, | ||
"only_run_on_ml_node" : "false" | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
{ | ||
"bulk_indexing_clients": 4, | ||
"bulk_size": 200, | ||
"number_of_replicas": 1, | ||
"number_of_shards" :8, | ||
"ingest_percentage":100, | ||
"search_clients": 8, | ||
"warmup_iterations": 20, | ||
"iterations": 100, | ||
"variable_queries": 50, | ||
"k": 100, | ||
"only_run_on_ml_node" : "false", | ||
"concurent_segment_search_enabled": "true" | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
{ | ||
"bulk_indexing_clients": 4, | ||
"bulk_size": 200, | ||
"number_of_replicas": 1, | ||
"number_of_shards" :8, | ||
"ingest_percentage":100, | ||
"search_clients": 8, | ||
"warmup_iterations": 20, | ||
"iterations": 100, | ||
"variable_queries": 50, | ||
"k": 100, | ||
"only_run_on_ml_node" : "true" | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
{ | ||
"bulk_indexing_clients": 4, | ||
"bulk_size": 200, | ||
"number_of_replicas": 1, | ||
"number_of_shards" :8, | ||
"ingest_percentage":100, | ||
"search_clients": 8, | ||
"warmup_iterations": 20, | ||
"iterations": 100, | ||
"variable_queries": 50, | ||
"k": 100, | ||
"only_run_on_ml_node" : "true", | ||
"concurent_segment_search_enabled": "true" | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
# SPDX-License-Identifier: Apache-2.0 | ||
# | ||
# The OpenSearch Contributors require contributions made to | ||
# this file be licensed under the Apache-2.0 license or a | ||
# compatible open source license. | ||
|
||
from osbenchmark.worker_coordinator.runner import Retry, Runner | ||
from osbenchmark.client import RequestContextHolder | ||
|
||
# This runner class and registration is a temporary workaround while the next version of OSB is pending release | ||
def register(registry):
    """Register the concurrent-segment-search settings runner with ``registry``.

    The runner instance is wrapped in ``Retry`` and registered as an async
    runner under ``UpdateConcurrentSegmentSearchSettings.RUNNER_NAME``.
    """
    runner_name = UpdateConcurrentSegmentSearchSettings.RUNNER_NAME
    retrying_runner = Retry(UpdateConcurrentSegmentSearchSettings())
    registry.register_runner(runner_name, retrying_runner, async_runner=True)
|
||
request_context_holder = RequestContextHolder() | ||
|
||
class UpdateConcurrentSegmentSearchSettings(Runner):
    """Runner that updates the cluster's concurrent segment search settings.

    Issues a persistent cluster-settings update that sets
    ``search.concurrent_segment_search.enabled`` and, when a value is
    supplied, ``search.concurrent.max_slice_count``.
    """

    # NOTE(review): a reviewer asked whether this runner can be moved into
    # opensearch-benchmark itself rather than living alongside the workload.

    RUNNER_NAME = "update-concurrent-segment-search-settings"

    async def __call__(self, opensearch, params):
        """Send the settings update described by ``params``.

        :param opensearch: client object; only ``cluster.put_settings`` is
            called on it.
        :param params: mapping read via ``.get``. ``enable`` (default
            ``"false"``) becomes the value of
            ``search.concurrent_segment_search.enabled``. ``max_slice_count``
            (default ``None``) is added as
            ``search.concurrent.max_slice_count`` only when it is not ``None``.
        """
        enable_setting = params.get("enable", "false")
        max_slice_count = params.get("max_slice_count", None)
        body = {
            "persistent": {
                "search.concurrent_segment_search.enabled": enable_setting
            }
        }
        if max_slice_count is not None:
            body["persistent"]["search.concurrent.max_slice_count"] = max_slice_count
        request_context_holder.on_client_request_start()
        try:
            await opensearch.cluster.put_settings(body=body)
        finally:
            # Always close the start/end pair, even when the request raises,
            # so a failed attempt never leaves a start without a matching end.
            request_context_holder.on_client_request_end()

    def __repr__(self, *args, **kwargs):
        """Return the name this runner is registered under."""
        return self.RUNNER_NAME
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
{ | ||
"operation": { | ||
"operation-type": "delete-ml-model", | ||
"model-name": "{{ model_name | default('huggingface/sentence-transformers/all-mpnet-base-v2')}}" | ||
} | ||
}, | ||
{ | ||
"operation": { | ||
"operation-type": "register-ml-model", | ||
"model-name": "{{ model_name | default('huggingface/sentence-transformers/all-mpnet-base-v2')}}", | ||
"model-version": "{{ model_version | default('1.0.1') }}", | ||
"model-format": "{{ model_format | default('TORCH_SCRIPT') }}", | ||
"model-config-file": "{{ model_config_file | default('') }}" | ||
} | ||
}, | ||
{ | ||
"operation": "deploy-ml-model" | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
{ | ||
"operation": "semantic-search-neural", | ||
"warmup-iterations": {{warmup_iterations | default(50) | tojson}}, | ||
"iterations": {{iterations | default(100) | tojson }}, | ||
"clients": {{ search_clients | default(1)}} | ||
}, | ||
{ | ||
"operation": "semantic-search-hybrid-bm25-and-knn-search", | ||
"warmup-iterations": {{warmup_iterations | default(50) | tojson}}, | ||
"iterations": {{iterations | default(100) | tojson }}, | ||
"clients": {{ search_clients | default(1)}} | ||
}, | ||
{ | ||
"operation": "semantic-search-hybrid-bm25-and-neural-search", | ||
"warmup-iterations": {{warmup_iterations | default(50) | tojson}}, | ||
"iterations": {{iterations | default(100) | tojson }}, | ||
"clients": {{ search_clients | default(1)}} | ||
}, | ||
{ | ||
"operation": "semantic-search-hybrid-bm25-range-and-neural-search", | ||
"warmup-iterations": {{warmup_iterations | default(50) | tojson}}, | ||
"iterations": {{iterations | default(100) | tojson }}, | ||
"clients": {{ search_clients | default(1)}} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
IMO, we can leave the defaults to the cluster. In other words, let's set this value if number_of_shards is provided in the params.