From 8e2ed18f1fe58cd9924a862d9dce9a232e2eb1ac Mon Sep 17 00:00:00 2001 From: Dicklesworthstone Date: Mon, 27 May 2024 19:13:00 -0400 Subject: [PATCH] Add mean/mins_maxes pooling, cap Redis maxclients, and monitor advanced search Drop the unsupported "all" option from result_sorting_metric validation; cap Redis maxclients at 50000; add "mean" and "mins_maxes" embedding pooling methods; add resource monitoring and request details for the advanced semantic search endpoint; reduce the pre-corpus-filter result multiplier from 100x to 25x; include request metadata and search-request details in document embedding responses. --- embeddings_data_models.py | 2 +- misc_utility_functions.py | 2 +- service_functions.py | 20 +++++++++++++-- swiss_army_llama.py | 53 +++++++++++++++++++++++++++++++-------- 4 files changed, 62 insertions(+), 15 deletions(-) diff --git a/embeddings_data_models.py b/embeddings_data_models.py index f81f29f..44a4dad 100644 --- a/embeddings_data_models.py +++ b/embeddings_data_models.py @@ -125,7 +125,7 @@ class AdvancedSemanticSearchRequest(BaseModel): result_sorting_metric: str = "hoeffding_d" @field_validator('result_sorting_metric') def validate_similarity_measure(cls, value): - valid_measures = ["all", "spearman_rho", "kendall_tau", "approximate_distance_correlation", "jensen_shannon_similarity", "hoeffding_d"] + valid_measures = ["spearman_rho", "kendall_tau", "approximate_distance_correlation", "jensen_shannon_similarity", "hoeffding_d"] if value.lower() not in valid_measures: raise ValueError(f"Invalid similarity measure. 
Supported measures are: {', '.join(valid_measures)}") return value.lower() diff --git a/misc_utility_functions.py b/misc_utility_functions.py index e993855..2f83fd7 100644 --- a/misc_utility_functions.py +++ b/misc_utility_functions.py @@ -119,7 +119,7 @@ def set_config(key, value): raise set_config('maxmemory', maxmemory) set_config('maxmemory-policy', 'allkeys-lru') - max_clients = os.cpu_count() * 1000 + max_clients = min(os.cpu_count() * 1000, 50000) set_config('maxclients', max_clients) set_config('timeout', 300) set_config('save', '900 1 300 10 60 10000') diff --git a/service_functions.py b/service_functions.py index 5951c30..268d365 100644 --- a/service_functions.py +++ b/service_functions.py @@ -289,7 +289,14 @@ async def calculate_sentence_embeddings_list(llama, texts: list, embedding_pooli if number_of_embeddings < min_components: padding = np.zeros((min_components - number_of_embeddings, dimension_of_token_embeddings)) embeddings = np.vstack([embeddings, padding]) - if embedding_pooling_method == "svd": + if embedding_pooling_method == "mean": + element_wise_mean = np.mean(embeddings, axis=0) + flattened_vector = element_wise_mean.flatten() + elif embedding_pooling_method == "mins_maxes": + element_wise_min = np.min(embeddings, axis=0) + element_wise_max = np.max(embeddings, axis=0) + flattened_vector = np.concatenate([element_wise_min, element_wise_max], axis=0) + elif embedding_pooling_method == "svd": svd = TruncatedSVD(n_components=2) svd_embeddings = svd.fit_transform(embeddings.T) flattened_vector = svd_embeddings.flatten() @@ -995,11 +1002,20 @@ def start_resource_monitoring(endpoint_name: str, input_data: Dict[str, Any], cl "num_characters_in_question": len(question), "llm_model_name": input_data.get("llm_model_name", ""), "temperature": input_data.get("temperature", 0.7), - "grammar_file_string": input_data.get("grammar_file_string", ""), "number_of_tokens_to_generate": input_data.get("number_of_tokens_to_generate", 256), 
"number_of_completions_to_generate": input_data.get("number_of_completions_to_generate", 1), "image_filename": input_data.get("image").filename if input_data.get("image") else "" } + elif endpoint_name == "advanced_search_stored_embeddings_with_query_string_for_semantic_similarity": + request_details = { + "query_text": input_data.get("query_text", ""), + "llm_model_name": input_data.get("llm_model_name", ""), + "embedding_pooling_method": input_data.get("embedding_pooling_method", ""), + "corpus_identifier_string": input_data.get("corpus_identifier_string", ""), + "similarity_filter_percentage": input_data.get("similarity_filter_percentage", 0.02), + "number_of_most_similar_strings_to_return": input_data.get("number_of_most_similar_strings_to_return", 10), + "result_sorting_metric": input_data.get("result_sorting_metric", "hoeffding_d") + } context = { "endpoint_name": endpoint_name, "start_time": start_time, diff --git a/swiss_army_llama.py b/swiss_army_llama.py index dcb10b7..70c8989 100644 --- a/swiss_army_llama.py +++ b/swiss_army_llama.py @@ -483,7 +483,7 @@ async def search_stored_embeddings_with_query_string_for_semantic_similarity(req llm_model_name = request.llm_model_name embedding_pooling_method = request.embedding_pooling_method num_results = request.number_of_most_similar_strings_to_return - num_results_before_corpus_filter = num_results*100 + num_results_before_corpus_filter = num_results*25 total_entries = len(associated_texts_by_model_and_pooling_method[llm_model_name][embedding_pooling_method]) # Get the total number of entries for the model and pooling method num_results = min(num_results, total_entries) # Ensure num_results doesn't exceed the total number of entries num_results_before_corpus_filter = min(num_results_before_corpus_filter, total_entries) # Ensure num_results_before_corpus_filter doesn't exceed the total number of entries @@ -591,6 +591,7 @@ async def advanced_search_stored_embeddings_with_query_string_for_semantic_simil lock = 
await shared_resources.lock_manager.lock(unique_id) if lock.valid: try: + context = start_resource_monitoring("advanced_search_stored_embeddings_with_query_string_for_semantic_similarity", request.dict(), req.client.host if req else "localhost") faiss_indexes, associated_texts_by_model_and_pooling_method = await build_faiss_indexes(force_rebuild=True) try: faiss_index = faiss_indexes[(request.llm_model_name, request.embedding_pooling_method)] @@ -598,7 +599,7 @@ async def advanced_search_stored_embeddings_with_query_string_for_semantic_simil raise HTTPException(status_code=400, detail=f"No FAISS index found for model: {request.llm_model_name} and pooling method: {request.embedding_pooling_method}") llm_model_name = request.llm_model_name embedding_pooling_method = request.embedding_pooling_method - num_results_before_corpus_filter = request.number_of_most_similar_strings_to_return * 100 + num_results_before_corpus_filter = request.number_of_most_similar_strings_to_return*25 logger.info(f"Received request to find most similar strings for query text: `{request.query_text}` using model: {llm_model_name}") try: logger.info(f"Computing embedding for input text: {request.query_text}") @@ -654,6 +655,7 @@ async def advanced_search_stored_embeddings_with_query_string_for_semantic_simil raise HTTPException(status_code=500, detail="Internal Server Error") finally: await shared_resources.lock_manager.unlock(lock) + end_resource_monitoring(context) else: return {"status": "already processing"} @@ -766,13 +768,26 @@ async def get_all_embedding_vectors_for_document( if len(json_content) == 0: raise HTTPException(status_code=400, detail="Could not retrieve document embedding results.") existing_document = 1 + document_embedding_request = {} else: + document_embedding_request = {} existing_document = 0 with open(temp_file_path, 'rb') as f: input_data_binary = f.read() result = magika.identify_bytes(input_data_binary) mime_type = result.output.mime_type sentences, 
thousands_of_input_words = await parse_submitted_document_file_into_sentence_strings_func(temp_file_path, mime_type) + document_embedding_request['mime_type'] = mime_type + document_embedding_request['sentences'] = sentences + document_embedding_request['total_number_of_sentences'] = len(sentences) + document_embedding_request['total_words'] = sum(len(sentence.split()) for sentence in sentences) + document_embedding_request['total_characters'] = sum(len(sentence) for sentence in sentences) + document_embedding_request['thousands_of_input_words'] = thousands_of_input_words + document_embedding_request['file_size_mb'] = os.path.getsize(temp_file_path) / (1024 * 1024) + document_embedding_request['corpus_identifier_string'] = corpus_identifier_string + document_embedding_request['embedding_pooling_method'] = embedding_pooling_method + document_embedding_request['llm_model_name'] = llm_model_name + document_embedding_request['document_file_hash'] = document_file_hash if thousands_of_input_words > MAX_THOUSANDS_OF_WORDs_FOR_DOCUMENT_EMBEDDING: raise HTTPException(status_code=400, detail=f"Document contains ~{int(thousands_of_input_words*1000):,}, more than the maximum of {MAX_THOUSANDS_OF_WORDs_FOR_DOCUMENT_EMBEDDING*1000:,} words, which would take too long to compute embeddings for. 
Please submit a smaller document.") first_10_words_of_input_text = ' '.join(' '.join(sentences).split()[:10]) @@ -787,29 +802,45 @@ async def get_all_embedding_vectors_for_document( try: json_content = await compute_embeddings_for_document(sentences=sentences, llm_model_name=llm_model_name, embedding_pooling_method=embedding_pooling_method, corpus_identifier_string=corpus_identifier_string, client_ip=client_ip, document_file_hash=document_file_hash, file=file, original_file_content=input_data_binary, json_format=json_format) logger.info(f"Done getting all regular embeddings for document containing {len(sentences):,} sentences with model {llm_model_name} and embedding pooling method {embedding_pooling_method} and corpus {corpus_identifier_string}") + except Exception as e: logger.error(f"Error while computing embeddings for document: {e}") traceback.print_exc() raise HTTPException(status_code=400, detail="Error while computing embeddings for document") finally: end_resource_monitoring(context) - - if query_text: + if query_text: + use_advanced_semantic_search = 0 + if use_advanced_semantic_search: + search_request = AdvancedSemanticSearchRequest( + query_text=query_text, + llm_model_name=llm_model_name, + embedding_pooling_method=embedding_pooling_method, + corpus_identifier_string=corpus_identifier_string, + similarity_filter_percentage=0.01, + result_sorting_metric="hoeffding_d", + number_of_most_similar_strings_to_return=10 + ) + logger.info(f"Performing advanced semantic search for model {llm_model_name} and pooling method {embedding_pooling_method}...") + search_response = await advanced_search_stored_embeddings_with_query_string_for_semantic_similarity(search_request, req, token) + search_results = search_response["results"] + else: search_request = SemanticSearchRequest( query_text=query_text, llm_model_name=llm_model_name, embedding_pooling_method=embedding_pooling_method, corpus_identifier_string=corpus_identifier_string, - 
number_of_most_similar_strings_to_return=15 + number_of_most_similar_strings_to_return=10 ) + logger.info(f"Performing semantic search for model {llm_model_name} and pooling method {embedding_pooling_method}...") search_response = await search_stored_embeddings_with_query_string_for_semantic_similarity(search_request, req, token) search_results = search_response["results"] - json_content_dict = {"document_embedding_results": json.loads(json_content), "semantic_search_results": search_results} - json_content = json.dumps(json_content_dict) - else: - json_content_dict = {"document_embedding_results": json.loads(json_content)} - json_content = json.dumps(json_content_dict) - + logger.info(f"Advanced semantic search completed. Results for query text '{query_text}'\n: {search_results}") + json_content_dict = {"document_embedding_request": document_embedding_request, "document_embedding_results": json.loads(json_content), "semantic_search_request": dict(search_request), "semantic_search_results": search_results} + json_content = json.dumps(json_content_dict) + else: + json_content_dict = {"document_embedding_request": document_embedding_request, "document_embedding_results": json.loads(json_content)} + json_content = json.dumps(json_content_dict) overall_total_time = (datetime.utcnow() - request_time).total_seconds() json_content_length = len(json_content) if json_content_length > 0: