Commit 8e2ed18: Fix

Dicklesworthstone committed May 27, 2024
1 parent 900464f · commit 8e2ed18
Showing 4 changed files with 62 additions and 15 deletions.
2 changes: 1 addition & 1 deletion embeddings_data_models.py
@@ -125,7 +125,7 @@ class AdvancedSemanticSearchRequest(BaseModel):
     result_sorting_metric: str = "hoeffding_d"
     @field_validator('result_sorting_metric')
     def validate_similarity_measure(cls, value):
-        valid_measures = ["all", "spearman_rho", "kendall_tau", "approximate_distance_correlation", "jensen_shannon_similarity", "hoeffding_d"]
+        valid_measures = ["spearman_rho", "kendall_tau", "approximate_distance_correlation", "jensen_shannon_similarity", "hoeffding_d"]
         if value.lower() not in valid_measures:
             raise ValueError(f"Invalid similarity measure. Supported measures are: {', '.join(valid_measures)}")
         return value.lower()
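The deleted entry means "all" is no longer an accepted sorting metric. For reference, a minimal, self-contained sketch of the validator's behavior, assuming Pydantic v2 (implied by the `field_validator` import); the usage lines at the bottom are illustrative:

```python
from pydantic import BaseModel, ValidationError, field_validator

class AdvancedSemanticSearchRequest(BaseModel):
    result_sorting_metric: str = "hoeffding_d"

    @field_validator('result_sorting_metric')
    def validate_similarity_measure(cls, value):
        valid_measures = ["spearman_rho", "kendall_tau", "approximate_distance_correlation",
                          "jensen_shannon_similarity", "hoeffding_d"]
        if value.lower() not in valid_measures:
            raise ValueError(f"Invalid similarity measure. Supported measures are: {', '.join(valid_measures)}")
        return value.lower()  # value is normalized to lowercase

print(AdvancedSemanticSearchRequest(result_sorting_metric="Kendall_Tau").result_sorting_metric)  # kendall_tau
try:
    AdvancedSemanticSearchRequest(result_sorting_metric="all")  # no longer valid after this commit
except ValidationError as e:
    print("rejected:", e.errors()[0]["msg"])
```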
2 changes: 1 addition & 1 deletion misc_utility_functions.py
@@ -119,7 +119,7 @@ def set_config(key, value):
         raise
     set_config('maxmemory', maxmemory)
     set_config('maxmemory-policy', 'allkeys-lru')
-    max_clients = os.cpu_count() * 1000
+    max_clients = min(os.cpu_count() * 1000, 50000)
     set_config('maxclients', max_clients)
     set_config('timeout', 300)
     set_config('save', '900 1 300 10 60 10000')
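The new ceiling guards against over-provisioning on many-core hosts, where `os.cpu_count() * 1000` alone could exceed Redis's file-descriptor budget (a 128-core box would request 128,000 client slots). A minimal sketch of the capped setting, assuming a local redis-py client; `set_config` here is a stand-in for the repo's helper:

```python
import os
import redis

r = redis.Redis(host="localhost", port=6379)

def set_config(key, value):
    # Mirrors CONFIG SET, as the helper in misc_utility_functions.py does.
    r.config_set(key, value)

# Capped: never ask Redis for more than 50,000 client slots.
max_clients = min(os.cpu_count() * 1000, 50000)
set_config('maxclients', max_clients)
```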
20 changes: 18 additions & 2 deletions service_functions.py
@@ -289,7 +289,14 @@ async def calculate_sentence_embeddings_list(llama, texts: list, embedding_pooling_method
         if number_of_embeddings < min_components:
             padding = np.zeros((min_components - number_of_embeddings, dimension_of_token_embeddings))
             embeddings = np.vstack([embeddings, padding])
-        if embedding_pooling_method == "svd":
+        if embedding_pooling_method == "mean":
+            element_wise_mean = np.mean(embeddings, axis=0)
+            flattened_vector = element_wise_mean.flatten()
+        elif embedding_pooling_method == "mins_maxes":
+            element_wise_min = np.min(embeddings, axis=0)
+            element_wise_max = np.max(embeddings, axis=0)
+            flattened_vector = np.concatenate([element_wise_min, element_wise_max], axis=0)
+        elif embedding_pooling_method == "svd":
             svd = TruncatedSVD(n_components=2)
             svd_embeddings = svd.fit_transform(embeddings.T)
             flattened_vector = svd_embeddings.flatten()
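For reference, a self-contained sketch of the three pooling strategies on a dummy token-embedding matrix; the shapes are illustrative, not taken from the repo:

```python
import numpy as np
from sklearn.decomposition import TruncatedSVD

rng = np.random.default_rng(0)
embeddings = rng.random((12, 8))  # 12 token embeddings of dimension 8 (illustrative sizes)

# "mean": element-wise average over tokens -> vector of length 8
mean_vector = np.mean(embeddings, axis=0).flatten()

# "mins_maxes": element-wise min and max, concatenated -> length 16
mins_maxes_vector = np.concatenate([np.min(embeddings, axis=0), np.max(embeddings, axis=0)], axis=0)

# "svd": top-2 components of the transposed matrix, flattened -> length 16
svd = TruncatedSVD(n_components=2)
svd_vector = svd.fit_transform(embeddings.T).flatten()

print(mean_vector.shape, mins_maxes_vector.shape, svd_vector.shape)  # (8,) (16,) (16,)
```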
@@ -995,11 +1002,20 @@ def start_resource_monitoring(endpoint_name: str, input_data: Dict[str, Any], client_ip: str):
             "num_characters_in_question": len(question),
             "llm_model_name": input_data.get("llm_model_name", ""),
             "temperature": input_data.get("temperature", 0.7),
             "grammar_file_string": input_data.get("grammar_file_string", ""),
             "number_of_tokens_to_generate": input_data.get("number_of_tokens_to_generate", 256),
             "number_of_completions_to_generate": input_data.get("number_of_completions_to_generate", 1),
             "image_filename": input_data.get("image").filename if input_data.get("image") else ""
         }
+    elif endpoint_name == "advanced_search_stored_embeddings_with_query_string_for_semantic_similarity":
+        request_details = {
+            "query_text": input_data.get("query_text", ""),
+            "llm_model_name": input_data.get("llm_model_name", ""),
+            "embedding_pooling_method": input_data.get("embedding_pooling_method", ""),
+            "corpus_identifier_string": input_data.get("corpus_identifier_string", ""),
+            "similarity_filter_percentage": input_data.get("similarity_filter_percentage", 0.02),
+            "number_of_most_similar_strings_to_return": input_data.get("number_of_most_similar_strings_to_return", 10),
+            "result_sorting_metric": input_data.get("result_sorting_metric", "hoeffding_d")
+        }
     context = {
         "endpoint_name": endpoint_name,
         "start_time": start_time,
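A hypothetical sketch of how the per-endpoint `request_details` feed the monitoring context from `start_resource_monitoring` to `end_resource_monitoring`; the helper bodies below are illustrative stand-ins, not the repo's actual implementations, though the field names mirror the diff above:

```python
import time
from typing import Any, Dict

def start_resource_monitoring(endpoint_name: str, input_data: Dict[str, Any], client_ip: str) -> Dict[str, Any]:
    # Capture a per-endpoint snapshot of the request for later reporting.
    request_details = {
        "query_text": input_data.get("query_text", ""),
        "llm_model_name": input_data.get("llm_model_name", ""),
        "result_sorting_metric": input_data.get("result_sorting_metric", "hoeffding_d"),
    }
    return {"endpoint_name": endpoint_name, "start_time": time.time(),
            "client_ip": client_ip, "request_details": request_details}

def end_resource_monitoring(context: Dict[str, Any]) -> None:
    elapsed = time.time() - context["start_time"]
    print(f"{context['endpoint_name']} served {context['client_ip']} in {elapsed:.3f}s")

ctx = start_resource_monitoring("advanced_search_stored_embeddings_with_query_string_for_semantic_similarity",
                                {"query_text": "example"}, "localhost")
end_resource_monitoring(ctx)
```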
53 changes: 42 additions & 11 deletions swiss_army_llama.py
@@ -483,7 +483,7 @@ async def search_stored_embeddings_with_query_string_for_semantic_similarity(req
     llm_model_name = request.llm_model_name
     embedding_pooling_method = request.embedding_pooling_method
     num_results = request.number_of_most_similar_strings_to_return
-    num_results_before_corpus_filter = num_results*100
+    num_results_before_corpus_filter = num_results*25
     total_entries = len(associated_texts_by_model_and_pooling_method[llm_model_name][embedding_pooling_method])  # Get the total number of entries for the model and pooling method
     num_results = min(num_results, total_entries)  # Ensure num_results doesn't exceed the total number of entries
     num_results_before_corpus_filter = min(num_results_before_corpus_filter, total_entries)  # Ensure num_results_before_corpus_filter doesn't exceed the total number of entries
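The multiplier implements an over-fetch-then-filter pattern: the FAISS index is not partitioned by corpus, so the search requests a multiple of the desired count (now 25x rather than 100x) and filters to the target corpus afterward. A sketch with assumed numbers:

```python
# Assumed numbers for illustration; the FAISS call is shown as a comment
# because the index itself lives inside the service.
num_results = 10
total_entries = 1_000  # entries for this (model, pooling method) pair

num_results_before_corpus_filter = num_results * 25   # was * 100 before this commit
num_results = min(num_results, total_entries)
num_results_before_corpus_filter = min(num_results_before_corpus_filter, total_entries)

# faiss_index.search(query_embedding, num_results_before_corpus_filter)
# would then return 250 candidates, filtered down to the 10 best hits
# whose corpus_identifier_string matches the request.
print(num_results, num_results_before_corpus_filter)  # 10 250
```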
@@ -591,14 +591,15 @@ async def advanced_search_stored_embeddings_with_query_string_for_semantic_similarity
     lock = await shared_resources.lock_manager.lock(unique_id)
     if lock.valid:
         try:
+            context = start_resource_monitoring("advanced_search_stored_embeddings_with_query_string_for_semantic_similarity", request.dict(), req.client.host if req else "localhost")
             faiss_indexes, associated_texts_by_model_and_pooling_method = await build_faiss_indexes(force_rebuild=True)
             try:
                 faiss_index = faiss_indexes[(request.llm_model_name, request.embedding_pooling_method)]
             except KeyError:
                 raise HTTPException(status_code=400, detail=f"No FAISS index found for model: {request.llm_model_name} and pooling method: {request.embedding_pooling_method}")
             llm_model_name = request.llm_model_name
             embedding_pooling_method = request.embedding_pooling_method
-            num_results_before_corpus_filter = request.number_of_most_similar_strings_to_return * 100
+            num_results_before_corpus_filter = request.number_of_most_similar_strings_to_return*25
             logger.info(f"Received request to find most similar strings for query text: `{request.query_text}` using model: {llm_model_name}")
             try:
                 logger.info(f"Computing embedding for input text: {request.query_text}")
@@ -654,6 +655,7 @@ async def advanced_search_stored_embeddings_with_query_string_for_semantic_similarity
             raise HTTPException(status_code=500, detail="Internal Server Error")
         finally:
             await shared_resources.lock_manager.unlock(lock)
+            end_resource_monitoring(context)
     else:
         return {"status": "already processing"}

@@ -766,13 +768,26 @@ async def get_all_embedding_vectors_for_document(
         if len(json_content) == 0:
             raise HTTPException(status_code=400, detail="Could not retrieve document embedding results.")
         existing_document = 1
+        document_embedding_request = {}
     else:
+        document_embedding_request = {}
         existing_document = 0
         with open(temp_file_path, 'rb') as f:
             input_data_binary = f.read()
         result = magika.identify_bytes(input_data_binary)
         mime_type = result.output.mime_type
         sentences, thousands_of_input_words = await parse_submitted_document_file_into_sentence_strings_func(temp_file_path, mime_type)
+        document_embedding_request['mime_type'] = mime_type
+        document_embedding_request['sentences'] = sentences
+        document_embedding_request['total_number_of_sentences'] = len(sentences)
+        document_embedding_request['total_words'] = sum(len(sentence.split()) for sentence in sentences)
+        document_embedding_request['total_characters'] = sum(len(sentence) for sentence in sentences)
+        document_embedding_request['thousands_of_input_words'] = thousands_of_input_words
+        document_embedding_request['file_size_mb'] = os.path.getsize(temp_file_path) / (1024 * 1024)
+        document_embedding_request['corpus_identifier_string'] = corpus_identifier_string
+        document_embedding_request['embedding_pooling_method'] = embedding_pooling_method
+        document_embedding_request['llm_model_name'] = llm_model_name
+        document_embedding_request['document_file_hash'] = document_file_hash
         if thousands_of_input_words > MAX_THOUSANDS_OF_WORDs_FOR_DOCUMENT_EMBEDDING:
             raise HTTPException(status_code=400, detail=f"Document contains ~{int(thousands_of_input_words*1000):,} words, more than the maximum of {MAX_THOUSANDS_OF_WORDs_FOR_DOCUMENT_EMBEDDING*1000:,} words, which would take too long to compute embeddings for. Please submit a smaller document.")
         first_10_words_of_input_text = ' '.join(' '.join(sentences).split()[:10])
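A runnable sketch of the MIME sniff and the word/character tallies recorded above, using the `magika` package as the diff does; the bytes and sentences are illustrative:

```python
from magika import Magika

# Illustrative bytes; in the endpoint these come from the uploaded temp file.
input_data_binary = b"%PDF-1.4 example document content"
magika = Magika()
result = magika.identify_bytes(input_data_binary)
mime_type = result.output.mime_type  # e.g. "application/pdf" for real PDF bytes

# The tallies stored in document_embedding_request are plain comprehensions:
sentences = ["First sentence.", "A second, slightly longer sentence."]
total_words = sum(len(sentence.split()) for sentence in sentences)    # 7
total_characters = sum(len(sentence) for sentence in sentences)       # 50
print(mime_type, total_words, total_characters)
```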
@@ -787,29 +802,45 @@ async def get_all_embedding_vectors_for_document(
     try:
         json_content = await compute_embeddings_for_document(sentences=sentences, llm_model_name=llm_model_name, embedding_pooling_method=embedding_pooling_method, corpus_identifier_string=corpus_identifier_string, client_ip=client_ip, document_file_hash=document_file_hash, file=file, original_file_content=input_data_binary, json_format=json_format)
         logger.info(f"Done getting all regular embeddings for document containing {len(sentences):,} sentences with model {llm_model_name} and embedding pooling method {embedding_pooling_method} and corpus {corpus_identifier_string}")

     except Exception as e:
         logger.error(f"Error while computing embeddings for document: {e}")
         traceback.print_exc()
         raise HTTPException(status_code=400, detail="Error while computing embeddings for document")
     finally:
         end_resource_monitoring(context)

     if query_text:
+        use_advanced_semantic_search = 0
+        if use_advanced_semantic_search:
+            search_request = AdvancedSemanticSearchRequest(
+                query_text=query_text,
+                llm_model_name=llm_model_name,
+                embedding_pooling_method=embedding_pooling_method,
+                corpus_identifier_string=corpus_identifier_string,
+                similarity_filter_percentage=0.01,
+                result_sorting_metric="hoeffding_d",
+                number_of_most_similar_strings_to_return=10
+            )
+            logger.info(f"Performing advanced semantic search for model {llm_model_name} and pooling method {embedding_pooling_method}...")
+            search_response = await advanced_search_stored_embeddings_with_query_string_for_semantic_similarity(search_request, req, token)
+            search_results = search_response["results"]
+        else:
             search_request = SemanticSearchRequest(
                 query_text=query_text,
                 llm_model_name=llm_model_name,
                 embedding_pooling_method=embedding_pooling_method,
                 corpus_identifier_string=corpus_identifier_string,
-                number_of_most_similar_strings_to_return=15
+                number_of_most_similar_strings_to_return=10
             )
             logger.info(f"Performing semantic search for model {llm_model_name} and pooling method {embedding_pooling_method}...")
             search_response = await search_stored_embeddings_with_query_string_for_semantic_similarity(search_request, req, token)
             search_results = search_response["results"]
-        json_content_dict = {"document_embedding_results": json.loads(json_content), "semantic_search_results": search_results}
-        json_content = json.dumps(json_content_dict)
-    else:
-        json_content_dict = {"document_embedding_results": json.loads(json_content)}
-        json_content = json.dumps(json_content_dict)
+        logger.info(f"Semantic search completed. Results for query text '{query_text}':\n{search_results}")
+        json_content_dict = {"document_embedding_request": document_embedding_request, "document_embedding_results": json.loads(json_content), "semantic_search_request": dict(search_request), "semantic_search_results": search_results}
+        json_content = json.dumps(json_content_dict)
+    else:
+        json_content_dict = {"document_embedding_request": document_embedding_request, "document_embedding_results": json.loads(json_content)}
+        json_content = json.dumps(json_content_dict)
     overall_total_time = (datetime.utcnow() - request_time).total_seconds()
     json_content_length = len(json_content)
     if json_content_length > 0:
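Finally, a sketch of the enriched response assembly: the new `document_embedding_request` metadata always rides along, while the search request and results are attached only when `query_text` was supplied. All values below are placeholders:

```python
import json

# Placeholder values; in the endpoint these come from the steps above.
document_embedding_request = {"llm_model_name": "example-model", "embedding_pooling_method": "mean"}
document_embedding_results = {"sentences": ["example sentence"]}
search_request = {"query_text": "example query"}  # dict(search_request) in the endpoint
search_results = [{"search_result_text": "example sentence", "hoeffding_d": 0.42}]
query_text = "example query"  # empty string when no search was requested

if query_text:
    json_content_dict = {
        "document_embedding_request": document_embedding_request,
        "document_embedding_results": document_embedding_results,
        "semantic_search_request": search_request,
        "semantic_search_results": search_results,
    }
else:
    json_content_dict = {
        "document_embedding_request": document_embedding_request,
        "document_embedding_results": document_embedding_results,
    }
json_content = json.dumps(json_content_dict)
print(len(json_content))
```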
