From e057db85e9e1574fe54703cdc9880c7877ac439a Mon Sep 17 00:00:00 2001 From: HamadaSalhab Date: Wed, 23 Oct 2024 16:28:52 +0300 Subject: [PATCH 1/6] Optimize NLP processing for improved performance --- agents-api/agents_api/common/nlp.py | 355 +++++++++++++++------------- 1 file changed, 186 insertions(+), 169 deletions(-) diff --git a/agents-api/agents_api/common/nlp.py b/agents-api/agents_api/common/nlp.py index a2f2f17ea..89b06a2c1 100644 --- a/agents-api/agents_api/common/nlp.py +++ b/agents-api/agents_api/common/nlp.py @@ -1,221 +1,238 @@ import re from collections import Counter, defaultdict +from functools import lru_cache +from typing import Dict, List, Set, Tuple import spacy +from spacy.matcher import PhraseMatcher +from spacy.tokens import Doc +from spacy.util import filter_spans -# Load spaCy English model +# Precompile regex patterns +WHITESPACE_RE = re.compile(r"\s+") +NON_ALPHANUM_RE = re.compile(r"[^\w\s\-_]+") + +# Initialize spaCy with minimal pipeline spacy.prefer_gpu() -nlp = spacy.load("en_core_web_sm") +nlp = spacy.load( + "en_core_web_sm", + disable=["lemmatizer", "textcat", "vector"], # Disable unused components +) +# Singleton PhraseMatcher for better performance -def extract_keywords(text: str, top_n: int = 10, clean: bool = True) -> list[str]: - """ - Extracts significant keywords and phrases from the text. - Args: - text (str): The input text to process. - top_n (int): Number of top keywords to extract based on frequency. - clean (bool): Strip non-alphanumeric characters from keywords. +class KeywordMatcher: + _instance = None - Returns: - List[str]: A list of extracted keywords/phrases. - """ - doc = nlp(text) - - # Extract named entities - entities = [ - ent.text.strip() - for ent in doc.ents - if ent.label_ - not in ["DATE", "TIME", "PERCENT", "MONEY", "QUANTITY", "ORDINAL", "CARDINAL"] - ] + def __new__(cls): + if cls._instance is None: + cls._instance = super().__new__(cls) + cls._instance.matcher = PhraseMatcher(nlp.vocab, attr="LOWER") + cls._instance.batch_size = 1000 + cls._instance.patterns_cache = {} + return cls._instance - # Extract nouns and proper nouns - nouns = [ - chunk.text.strip().lower() - for chunk in doc.noun_chunks - if not chunk.root.is_stop - ] + @lru_cache(maxsize=10000) + def _create_pattern(self, text: str) -> Doc: + return nlp.make_doc(text) - # Combine entities and nouns - combined = entities + nouns + def find_matches(self, doc: Doc, keywords: List[str]) -> Dict[str, List[int]]: + """Batch process keywords for better performance.""" + keyword_positions = defaultdict(list) - # Normalize and count frequency - normalized = [re.sub(r"\s+", " ", kw).strip() for kw in combined] - freq = Counter(normalized) + # Process keywords in batches to avoid memory issues + for i in range(0, len(keywords), self.batch_size): + batch = keywords[i : i + self.batch_size] + patterns = [self._create_pattern(kw) for kw in batch] - # Get top_n keywords - keywords = [item for item, count in freq.most_common(top_n)] + # Clear previous patterns and add new batch + if "KEYWORDS" in self.matcher: + self.matcher.remove("KEYWORDS") + self.matcher.add("KEYWORDS", patterns) - if clean: - keywords = [re.sub(r"[^\w\s\-_]+", "", kw) for kw in keywords] + # Find matches for this batch + matches = self.matcher(doc) + for match_id, start, end in matches: + span_text = doc[start:end].text + normalized = WHITESPACE_RE.sub(" ", span_text).lower().strip() + keyword_positions[normalized].append(start) - return keywords + return keyword_positions -def find_keyword_positions(doc, keyword: str) -> list[int]: - """ - Finds all start indices of the keyword in the tokenized doc. +# Initialize global matcher +keyword_matcher = KeywordMatcher() - Args: - doc (spacy.tokens.Doc): The tokenized document. - keyword (str): The keyword or phrase to search for. - Returns: - List[int]: List of starting token indices where the keyword appears. - """ - keyword_tokens = keyword.split() - n = len(keyword_tokens) - positions = [] - for i in range(len(doc) - n + 1): - window = doc[i : i + n] - window_text = " ".join([token.text.lower() for token in window]) - if window_text == keyword: - positions.append(i) - return positions +@lru_cache(maxsize=10000) +def clean_keyword(kw: str) -> str: + """Cache cleaned keywords for reuse.""" + return NON_ALPHANUM_RE.sub("", kw).strip() -def find_proximity_groups( - text: str, keywords: list[str], n: int = 10 -) -> list[set[str]]: - """ - Groups keywords that appear within n words of each other. +def extract_keywords(doc: Doc, top_n: int = 10, clean: bool = True) -> List[str]: + """Optimized keyword extraction with minimal behavior change.""" + excluded_labels = { + "DATE", + "TIME", + "PERCENT", + "MONEY", + "QUANTITY", + "ORDINAL", + "CARDINAL", + } - Args: - text (str): The input text. - keywords (List[str]): List of keywords to consider. - n (int): The proximity window in words. + # Extract and filter spans in a single pass + ent_spans = [ent for ent in doc.ents if ent.label_ not in excluded_labels] + chunk_spans = [chunk for chunk in doc.noun_chunks if not chunk.root.is_stop] + all_spans = filter_spans(ent_spans + chunk_spans) - Returns: - List[Set[str]]: List of sets, each containing keywords that are proximate. - """ - doc = nlp(text.lower()) - keyword_positions = defaultdict(list) + # Process spans efficiently + keywords = [] + seen_texts = set() - for kw in keywords: - positions = find_keyword_positions(doc, kw) - keyword_positions[kw].extend(positions) - - # Initialize Union-Find structure - parent = {} - - def find(u): - while parent[u] != u: - parent[u] = parent[parent[u]] - u = parent[u] - return u - - def union(u, v): - u_root = find(u) - v_root = find(v) - if u_root == v_root: - return - parent[v_root] = u_root - - # Initialize each keyword as its own parent - for kw in keywords: - parent[kw] = kw - - # Compare all pairs of keywords - for i in range(len(keywords)): - for j in range(i + 1, len(keywords)): - kw1 = keywords[i] - kw2 = keywords[j] - positions1 = keyword_positions[kw1] - positions2 = keyword_positions[kw2] - # Check if any positions are within n words - for pos1 in positions1: - for pos2 in positions2: - distance = abs(pos1 - pos2) - if distance <= n: - union(kw1, kw2) - break - else: - continue - break - - # Group keywords by their root parent + for span in all_spans: + text = span.text.strip() + lower_text = text.lower() + + # Skip empty or seen texts + if not text or lower_text in seen_texts: + continue + + seen_texts.add(lower_text) + keywords.append(text) + + # Normalize keywords by replacing multiple spaces with single space and stripping + normalized_keywords = [WHITESPACE_RE.sub(" ", kw).strip() for kw in keywords] + + # Count frequencies efficiently + freq = Counter(normalized_keywords) + top_keywords = [kw for kw, _ in freq.most_common(top_n)] + + if clean: + return [clean_keyword(kw) for kw in top_keywords] + return top_keywords + + +def find_proximity_groups( + keywords: List[str], keyword_positions: Dict[str, List[int]], n: int = 10 +) -> List[Set[str]]: + """Optimized proximity grouping using sorted positions.""" + # Early return for single or no keywords + if len(keywords) <= 1: + return [{kw} for kw in keywords] + + # Create flat list of positions for efficient processing + positions: List[Tuple[int, str]] = [ + (pos, kw) for kw in keywords for pos in keyword_positions[kw] + ] + + # Sort positions once + positions.sort() + + # Initialize Union-Find with path compression and union by rank + parent = {kw: kw for kw in keywords} + rank = {kw: 0 for kw in keywords} + + def find(u: str) -> str: + if parent[u] != u: + parent[u] = find(parent[u]) + return parent[u] + + def union(u: str, v: str) -> None: + u_root, v_root = find(u), find(v) + if u_root != v_root: + if rank[u_root] < rank[v_root]: + u_root, v_root = v_root, u_root + parent[v_root] = u_root + if rank[u_root] == rank[v_root]: + rank[u_root] += 1 + + # Use sliding window for proximity checking + window = [] + for pos, kw in positions: + # Remove positions outside window + while window and pos - window[0][0] > n: + window.pop(0) + + # Union with all keywords in window + for _, w_kw in window: + union(kw, w_kw) + + window.append((pos, kw)) + + # Group keywords efficiently groups = defaultdict(set) for kw in keywords: root = find(kw) groups[root].add(kw) - # Convert to list of sets - group_list = list(groups.values()) - - return group_list + return list(groups.values()) -def build_query(groups: list[set[str]], keywords: list[str], n: int = 10) -> str: - """ - Builds a query string using the custom query language. +@lru_cache(maxsize=100) +def build_query_pattern(group_size: int, n: int) -> str: + """Cache query patterns for common group sizes.""" + if group_size == 1: + return '"{}"' + return f"NEAR/{n}(" + " ".join('"{}"' for _ in range(group_size)) + ")" - Args: - groups (List[Set[str]]): List of keyword groups. - keywords (List[str]): Original list of keywords. - n (int): The proximity window for NEAR. - Returns: - str: The constructed query string. - """ - grouped_keywords = set() +def build_query(groups: List[Set[str]], n: int = 10) -> str: + """Build query with cached patterns.""" clauses = [] for group in groups: if len(group) == 1: - clauses.append(f'"{list(group)[0]}"') + clauses.append(f'"{next(iter(group))}"') else: - sorted_group = sorted( - group, key=lambda x: -len(x) - ) # Sort by length to prioritize phrases - escaped_keywords = [f'"{kw}"' for kw in sorted_group] - near_clause = f"NEAR/{n}(" + " ".join(escaped_keywords) + ")" - clauses.append(near_clause) - grouped_keywords.update(group) - - # Identify keywords not in any group (if any) - remaining = set(keywords) - grouped_keywords - for kw in remaining: - clauses.append(f'"{kw}"') - - # Combine all clauses with OR - query = " OR ".join(clauses) + # Sort by length descending to prioritize longer phrases + sorted_group = sorted(group, key=len, reverse=True) + # Get cached pattern and format with keywords + pattern = build_query_pattern(len(group), n) + clause = pattern.format(*sorted_group) + clauses.append(clause) - return query + return " OR ".join(clauses) -def text_to_custom_query(text: str, top_n: int = 10, proximity_n: int = 10) -> str: +def paragraph_to_custom_queries( + paragraph: str, top_n: int = 10, proximity_n: int = 10, min_keywords: int = 1 +) -> List[str]: """ - Converts arbitrary text to the custom query language. + Optimized paragraph processing with minimal behavior changes. + Added min_keywords parameter to filter out low-value queries. + """ + if not paragraph or not paragraph.strip(): + return [] - Args: - text (str): The input text to convert. - top_n (int): Number of top keywords to extract. - proximity_n (int): The proximity window for NEAR/n. + # Process entire paragraph once + doc = nlp(paragraph) + queries = [] - Returns: - str: The custom query string. - """ - keywords = extract_keywords(text, top_n) - if not keywords: - return "" - groups = find_proximity_groups(text, keywords, proximity_n) - query = build_query(groups, keywords, proximity_n) - return query + # Process sentences + for sent in doc.sents: + # Convert to doc for consistent API + sent_doc = sent.as_doc() + # Extract and clean keywords + keywords = extract_keywords(sent_doc, top_n) + if len(keywords) < min_keywords: + continue -def paragraph_to_custom_queries(paragraph: str) -> list[str]: - """ - Converts a paragraph to a list of custom query strings. + # Find keyword positions using matcher + keyword_positions = keyword_matcher.find_matches(sent_doc, keywords) - Args: - paragraph (str): The input paragraph to convert. + # Skip if no keywords found in positions + if not keyword_positions: + continue - Returns: - List[str]: The list of custom query strings. - """ + # Find proximity groups and build query + groups = find_proximity_groups(keywords, keyword_positions, proximity_n) + query = build_query(groups, proximity_n) - queries = [text_to_custom_query(sentence.text) for sentence in nlp(paragraph).sents] - queries = [q for q in queries if q] + if query: + queries.append(query) return queries From 3fd92aa7b3f1fab0331f2a1200de974499976d74 Mon Sep 17 00:00:00 2001 From: HamadaSalhab Date: Wed, 23 Oct 2024 23:27:23 +0300 Subject: [PATCH 2/6] feat(agents-api): Add gunicorn & uvloop --- agents-api/Dockerfile | 2 +- agents-api/Dockerfile.worker | 2 +- agents-api/agents_api/web.py | 6 +-- agents-api/gunicorn_conf.py | 9 +++++ agents-api/poetry.lock | 74 +++++++++++++++++++++++++++++++++++- agents-api/pyproject.toml | 2 + 6 files changed, 89 insertions(+), 6 deletions(-) create mode 100644 agents-api/gunicorn_conf.py diff --git a/agents-api/Dockerfile b/agents-api/Dockerfile index bd4e29188..bee0a68d5 100644 --- a/agents-api/Dockerfile +++ b/agents-api/Dockerfile @@ -41,4 +41,4 @@ RUN poetry install --no-dev --no-root COPY . ./ -ENTRYPOINT ["python", "-m", "agents_api.web", "--host", "0.0.0.0", "--port", "8080"] +ENTRYPOINT ["gunicorn", "agents_api.web:app", "-c", "gunicorn_conf.py"] \ No newline at end of file diff --git a/agents-api/Dockerfile.worker b/agents-api/Dockerfile.worker index 81fa7d2cd..5dc2f8254 100644 --- a/agents-api/Dockerfile.worker +++ b/agents-api/Dockerfile.worker @@ -43,4 +43,4 @@ COPY . ./ RUN poetry install --no-dev -ENTRYPOINT ["python", "-m", "agents_api.worker"] +ENTRYPOINT ["gunicorn", "agents_api.worker:app", "-c", "gunicorn_conf.py"] diff --git a/agents-api/agents_api/web.py b/agents-api/agents_api/web.py index 56bdbb48a..54cd80425 100644 --- a/agents-api/agents_api/web.py +++ b/agents-api/agents_api/web.py @@ -2,6 +2,8 @@ This module initializes the FastAPI application, registers routes, sets up middleware, and configures exception handlers. """ +import asyncio +import uvloop import logging from typing import Any, Callable @@ -207,6 +209,4 @@ def main( ) -# Check if the script is being run directly and, if so, start the Uvicorn server with the specified configuration. -if __name__ == "__main__": - fire.Fire(main) +asyncio.set_event_loop_policy(uvloop.EventLoopPolicy()) diff --git a/agents-api/gunicorn_conf.py b/agents-api/gunicorn_conf.py new file mode 100644 index 000000000..1ecc02d9c --- /dev/null +++ b/agents-api/gunicorn_conf.py @@ -0,0 +1,9 @@ +import multiprocessing + +# Gunicorn config variables +workers = multiprocessing.cpu_count() * 2 + 1 +worker_class = "uvicorn.workers.UvicornWorker" +bind = "0.0.0.0:8080" +keepalive = 120 +errorlog = "-" +accesslog = "-" diff --git a/agents-api/poetry.lock b/agents-api/poetry.lock index c6507586a..1f5c96c37 100644 --- a/agents-api/poetry.lock +++ b/agents-api/poetry.lock @@ -1373,6 +1373,27 @@ files = [ {file = "google_re2-1.1.20240702.tar.gz", hash = "sha256:8788db69f6c93cb229df62c74b2d9aa8e64bf754e9495700f85812afa32efd2b"}, ] +[[package]] +name = "gunicorn" +version = "23.0.0" +description = "WSGI HTTP Server for UNIX" +optional = false +python-versions = ">=3.7" +files = [ + {file = "gunicorn-23.0.0-py3-none-any.whl", hash = "sha256:ec400d38950de4dfd418cff8328b2c8faed0edb0d517d3394e457c317908ca4d"}, + {file = "gunicorn-23.0.0.tar.gz", hash = "sha256:f014447a0101dc57e294f6c18ca6b40227a4c90e9bdb586042628030cba004ec"}, +] + +[package.dependencies] +packaging = "*" + +[package.extras] +eventlet = ["eventlet (>=0.24.1,!=0.36.0)"] +gevent = ["gevent (>=1.4.0)"] +setproctitle = ["setproctitle"] +testing = ["coverage", "eventlet", "gevent", "pytest", "pytest-cov"] +tornado = ["tornado (>=0.2)"] + [[package]] name = "h11" version = "0.14.0" @@ -5424,6 +5445,57 @@ h11 = ">=0.8" [package.extras] standard = ["colorama (>=0.4)", "httptools (>=0.5.0)", "python-dotenv (>=0.13)", "pyyaml (>=5.1)", "uvloop (>=0.14.0,!=0.15.0,!=0.15.1)", "watchfiles (>=0.13)", "websockets (>=10.4)"] +[[package]] +name = "uvloop" +version = "0.21.0" +description = "Fast implementation of asyncio event loop on top of libuv" +optional = false +python-versions = ">=3.8.0" +files = [ + {file = "uvloop-0.21.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:ec7e6b09a6fdded42403182ab6b832b71f4edaf7f37a9a0e371a01db5f0cb45f"}, + {file = "uvloop-0.21.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:196274f2adb9689a289ad7d65700d37df0c0930fd8e4e743fa4834e850d7719d"}, + {file = "uvloop-0.21.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f38b2e090258d051d68a5b14d1da7203a3c3677321cf32a95a6f4db4dd8b6f26"}, + {file = "uvloop-0.21.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87c43e0f13022b998eb9b973b5e97200c8b90823454d4bc06ab33829e09fb9bb"}, + {file = "uvloop-0.21.0-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:10d66943def5fcb6e7b37310eb6b5639fd2ccbc38df1177262b0640c3ca68c1f"}, + {file = "uvloop-0.21.0-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:67dd654b8ca23aed0a8e99010b4c34aca62f4b7fce88f39d452ed7622c94845c"}, + {file = "uvloop-0.21.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:c0f3fa6200b3108919f8bdabb9a7f87f20e7097ea3c543754cabc7d717d95cf8"}, + {file = "uvloop-0.21.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:0878c2640cf341b269b7e128b1a5fed890adc4455513ca710d77d5e93aa6d6a0"}, + {file = "uvloop-0.21.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b9fb766bb57b7388745d8bcc53a359b116b8a04c83a2288069809d2b3466c37e"}, + {file = "uvloop-0.21.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8a375441696e2eda1c43c44ccb66e04d61ceeffcd76e4929e527b7fa401b90fb"}, + {file = "uvloop-0.21.0-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:baa0e6291d91649c6ba4ed4b2f982f9fa165b5bbd50a9e203c416a2797bab3c6"}, + {file = "uvloop-0.21.0-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:4509360fcc4c3bd2c70d87573ad472de40c13387f5fda8cb58350a1d7475e58d"}, + {file = "uvloop-0.21.0-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:359ec2c888397b9e592a889c4d72ba3d6befba8b2bb01743f72fffbde663b59c"}, + {file = "uvloop-0.21.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:f7089d2dc73179ce5ac255bdf37c236a9f914b264825fdaacaded6990a7fb4c2"}, + {file = "uvloop-0.21.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:baa4dcdbd9ae0a372f2167a207cd98c9f9a1ea1188a8a526431eef2f8116cc8d"}, + {file = "uvloop-0.21.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:86975dca1c773a2c9864f4c52c5a55631038e387b47eaf56210f873887b6c8dc"}, + {file = "uvloop-0.21.0-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:461d9ae6660fbbafedd07559c6a2e57cd553b34b0065b6550685f6653a98c1cb"}, + {file = "uvloop-0.21.0-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:183aef7c8730e54c9a3ee3227464daed66e37ba13040bb3f350bc2ddc040f22f"}, + {file = "uvloop-0.21.0-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:bfd55dfcc2a512316e65f16e503e9e450cab148ef11df4e4e679b5e8253a5281"}, + {file = "uvloop-0.21.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:787ae31ad8a2856fc4e7c095341cccc7209bd657d0e71ad0dc2ea83c4a6fa8af"}, + {file = "uvloop-0.21.0-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:5ee4d4ef48036ff6e5cfffb09dd192c7a5027153948d85b8da7ff705065bacc6"}, + {file = "uvloop-0.21.0-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f3df876acd7ec037a3d005b3ab85a7e4110422e4d9c1571d4fc89b0fc41b6816"}, + {file = "uvloop-0.21.0-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:bd53ecc9a0f3d87ab847503c2e1552b690362e005ab54e8a48ba97da3924c0dc"}, + {file = "uvloop-0.21.0-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:a5c39f217ab3c663dc699c04cbd50c13813e31d917642d459fdcec07555cc553"}, + {file = "uvloop-0.21.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:17df489689befc72c39a08359efac29bbee8eee5209650d4b9f34df73d22e414"}, + {file = "uvloop-0.21.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:bc09f0ff191e61c2d592a752423c767b4ebb2986daa9ed62908e2b1b9a9ae206"}, + {file = "uvloop-0.21.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:f0ce1b49560b1d2d8a2977e3ba4afb2414fb46b86a1b64056bc4ab929efdafbe"}, + {file = "uvloop-0.21.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e678ad6fe52af2c58d2ae3c73dc85524ba8abe637f134bf3564ed07f555c5e79"}, + {file = "uvloop-0.21.0-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:460def4412e473896ef179a1671b40c039c7012184b627898eea5072ef6f017a"}, + {file = "uvloop-0.21.0-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:10da8046cc4a8f12c91a1c39d1dd1585c41162a15caaef165c2174db9ef18bdc"}, + {file = "uvloop-0.21.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:c097078b8031190c934ed0ebfee8cc5f9ba9642e6eb88322b9958b649750f72b"}, + {file = "uvloop-0.21.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:46923b0b5ee7fc0020bef24afe7836cb068f5050ca04caf6b487c513dc1a20b2"}, + {file = "uvloop-0.21.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:53e420a3afe22cdcf2a0f4846e377d16e718bc70103d7088a4f7623567ba5fb0"}, + {file = "uvloop-0.21.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:88cb67cdbc0e483da00af0b2c3cdad4b7c61ceb1ee0f33fe00e09c81e3a6cb75"}, + {file = "uvloop-0.21.0-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:221f4f2a1f46032b403bf3be628011caf75428ee3cc204a22addf96f586b19fd"}, + {file = "uvloop-0.21.0-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:2d1f581393673ce119355d56da84fe1dd9d2bb8b3d13ce792524e1607139feff"}, + {file = "uvloop-0.21.0.tar.gz", hash = "sha256:3bf12b0fda68447806a7ad847bfa591613177275d35b6724b1ee573faa3704e3"}, +] + +[package.extras] +dev = ["Cython (>=3.0,<4.0)", "setuptools (>=60)"] +docs = ["Sphinx (>=4.1.2,<4.2.0)", "sphinx-rtd-theme (>=0.5.2,<0.6.0)", "sphinxcontrib-asyncio (>=0.3.0,<0.4.0)"] +test = ["aiohttp (>=3.10.5)", "flake8 (>=5.0,<6.0)", "mypy (>=0.800)", "psutil", "pyOpenSSL (>=23.0.0,<23.1.0)", "pycodestyle (>=2.9.0,<2.10.0)"] + [[package]] name = "ward" version = "0.68.0b0" @@ -5885,4 +5957,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = ">=3.12,<3.13" -content-hash = "0b75ce61bf1e1338e08e99482083f7d1238216f11f9a07edfc29f61d9b620f6f" +content-hash = "6378447b12d87d1403ad5b9465b5fe7ed97b561b131d31eb52c85871da50449e" diff --git a/agents-api/pyproject.toml b/agents-api/pyproject.toml index 47c709286..910f9e53b 100644 --- a/agents-api/pyproject.toml +++ b/agents-api/pyproject.toml @@ -45,6 +45,8 @@ xxhash = "^3.5.0" spacy = "^3.8.2" en-core-web-sm = {url = "https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl"} msgpack = "^1.1.0" +gunicorn = "^23.0.0" +uvloop = "^0.21.0" [tool.poetry.group.dev.dependencies] ipython = "^8.26.0" ruff = "^0.5.5" From e9515f30bbc564f013e4365b3459cf4eb305742d Mon Sep 17 00:00:00 2001 From: HamadaSalhab Date: Wed, 23 Oct 2024 21:57:25 +0000 Subject: [PATCH 3/6] refactor: Lint agents-api (CI) --- agents-api/agents_api/web.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agents-api/agents_api/web.py b/agents-api/agents_api/web.py index 54cd80425..59d7ee429 100644 --- a/agents-api/agents_api/web.py +++ b/agents-api/agents_api/web.py @@ -3,13 +3,13 @@ """ import asyncio -import uvloop import logging from typing import Any, Callable import fire import sentry_sdk import uvicorn +import uvloop from fastapi import APIRouter, Depends, FastAPI, Request, status from fastapi.exceptions import HTTPException, RequestValidationError from fastapi.middleware.cors import CORSMiddleware From 111529d3e00b7e26f5da1d42b5fcd7c0d42013b0 Mon Sep 17 00:00:00 2001 From: HamadaSalhab Date: Thu, 24 Oct 2024 17:00:38 +0300 Subject: [PATCH 4/6] Fix worker run command --- agents-api/Dockerfile.worker | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/agents-api/Dockerfile.worker b/agents-api/Dockerfile.worker index 5dc2f8254..81fa7d2cd 100644 --- a/agents-api/Dockerfile.worker +++ b/agents-api/Dockerfile.worker @@ -43,4 +43,4 @@ COPY . ./ RUN poetry install --no-dev -ENTRYPOINT ["gunicorn", "agents_api.worker:app", "-c", "gunicorn_conf.py"] +ENTRYPOINT ["python", "-m", "agents_api.worker"] From d803ebb952c3dacc95a2bc0b87ce45c370af861f Mon Sep 17 00:00:00 2001 From: Diwank Singh Tomer Date: Thu, 24 Oct 2024 16:44:50 -0400 Subject: [PATCH 5/6] fix(agents-api): Fix vector search parameters Signed-off-by: Diwank Singh Tomer --- .../models/docs/search_docs_by_embedding.py | 4 +- agents-api/agents_api/web.py | 1 - ...rate_1729114011_tweak_proximity_indices.py | 41 +++++++++++++++++++ 3 files changed, 43 insertions(+), 3 deletions(-) diff --git a/agents-api/agents_api/models/docs/search_docs_by_embedding.py b/agents-api/agents_api/models/docs/search_docs_by_embedding.py index 7d3bbbd2f..e346b6b69 100644 --- a/agents-api/agents_api/models/docs/search_docs_by_embedding.py +++ b/agents-api/agents_api/models/docs/search_docs_by_embedding.py @@ -48,8 +48,8 @@ def search_docs_by_embedding( query_embedding: list[float], k: int = 3, confidence: float = 0.5, - ef: int = 32, - mmr_lambda: float = 0.25, + ef: int = 50, + mmr_lambda: float = 0.5, embedding_size: int = 1024, ) -> tuple[list[str], dict]: """ diff --git a/agents-api/agents_api/web.py b/agents-api/agents_api/web.py index 59d7ee429..037767b9f 100644 --- a/agents-api/agents_api/web.py +++ b/agents-api/agents_api/web.py @@ -6,7 +6,6 @@ import logging from typing import Any, Callable -import fire import sentry_sdk import uvicorn import uvloop diff --git a/agents-api/migrations/migrate_1729114011_tweak_proximity_indices.py b/agents-api/migrations/migrate_1729114011_tweak_proximity_indices.py index e8fbbaa58..4852f3603 100644 --- a/agents-api/migrations/migrate_1729114011_tweak_proximity_indices.py +++ b/agents-api/migrations/migrate_1729114011_tweak_proximity_indices.py @@ -12,6 +12,45 @@ def run(client, *queries): client.run(query) +# See: https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md +drop_snippets_hnsw_index = dict( + down=""" + ::hnsw create snippets:embedding_space { + fields: [embedding], + filter: !is_null(embedding), + dim: 1024, + distance: Cosine, + m: 64, + ef_construction: 256, + extend_candidates: true, + keep_pruned_connections: false, + } + """, + up=""" + ::hnsw drop snippets:embedding_space + """, +) + + +# See: https://github.com/nmslib/hnswlib/blob/master/ALGO_PARAMS.md +snippets_hnsw_index = dict( + up=""" + ::hnsw create snippets:embedding_space { + fields: [embedding], + filter: !is_null(embedding), + dim: 1024, + distance: Cosine, + m: 64, + ef_construction: 800, + extend_candidates: false, + keep_pruned_connections: false, + } + """, + down=""" + ::hnsw drop snippets:embedding_space + """, +) + drop_snippets_lsh_index = dict( up=""" ::lsh drop snippets:lsh @@ -77,8 +116,10 @@ def run(client, *queries): ) queries_to_run = [ + drop_snippets_hnsw_index, drop_snippets_lsh_index, drop_snippets_fts_index, + snippets_hnsw_index, snippets_lsh_index, snippets_fts_index, ] From 0f4c4e0f43fb44f05e2fc522db745a20afc1c700 Mon Sep 17 00:00:00 2001 From: Diwank Singh Tomer Date: Thu, 24 Oct 2024 17:22:52 -0400 Subject: [PATCH 6/6] feat(agents-api): Improve nlp performance Signed-off-by: Diwank Singh Tomer --- agents-api/agents_api/common/nlp.py | 80 +++++++++++++++++++++++------ 1 file changed, 64 insertions(+), 16 deletions(-) diff --git a/agents-api/agents_api/common/nlp.py b/agents-api/agents_api/common/nlp.py index 89b06a2c1..d7dcabe15 100644 --- a/agents-api/agents_api/common/nlp.py +++ b/agents-api/agents_api/common/nlp.py @@ -1,7 +1,6 @@ import re from collections import Counter, defaultdict from functools import lru_cache -from typing import Dict, List, Set, Tuple import spacy from spacy.matcher import PhraseMatcher @@ -13,15 +12,13 @@ NON_ALPHANUM_RE = re.compile(r"[^\w\s\-_]+") # Initialize spaCy with minimal pipeline -spacy.prefer_gpu() -nlp = spacy.load( - "en_core_web_sm", - disable=["lemmatizer", "textcat", "vector"], # Disable unused components -) +nlp = spacy.load("en_core_web_sm", exclude=["lemmatizer", "textcat", "tok2vec"]) -# Singleton PhraseMatcher for better performance +# Add sentencizer for faster sentence tokenization +sentencizer = nlp.add_pipe("sentencizer") +# Singleton PhraseMatcher for better performance class KeywordMatcher: _instance = None @@ -29,7 +26,7 @@ def __new__(cls): if cls._instance is None: cls._instance = super().__new__(cls) cls._instance.matcher = PhraseMatcher(nlp.vocab, attr="LOWER") - cls._instance.batch_size = 1000 + cls._instance.batch_size = 1000 # Adjust based on memory constraints cls._instance.patterns_cache = {} return cls._instance @@ -37,7 +34,7 @@ def __new__(cls): def _create_pattern(self, text: str) -> Doc: return nlp.make_doc(text) - def find_matches(self, doc: Doc, keywords: List[str]) -> Dict[str, List[int]]: + def find_matches(self, doc: Doc, keywords: list[str]) -> dict[str, list[int]]: """Batch process keywords for better performance.""" keyword_positions = defaultdict(list) @@ -71,7 +68,7 @@ def clean_keyword(kw: str) -> str: return NON_ALPHANUM_RE.sub("", kw).strip() -def extract_keywords(doc: Doc, top_n: int = 10, clean: bool = True) -> List[str]: +def extract_keywords(doc: Doc, top_n: int = 10, clean: bool = True) -> list[str]: """Optimized keyword extraction with minimal behavior change.""" excluded_labels = { "DATE", @@ -116,15 +113,15 @@ def extract_keywords(doc: Doc, top_n: int = 10, clean: bool = True) -> List[str] def find_proximity_groups( - keywords: List[str], keyword_positions: Dict[str, List[int]], n: int = 10 -) -> List[Set[str]]: + keywords: list[str], keyword_positions: dict[str, list[int]], n: int = 10 +) -> list[set[str]]: """Optimized proximity grouping using sorted positions.""" # Early return for single or no keywords if len(keywords) <= 1: return [{kw} for kw in keywords] # Create flat list of positions for efficient processing - positions: List[Tuple[int, str]] = [ + positions: list[tuple[int, str]] = [ (pos, kw) for kw in keywords for pos in keyword_positions[kw] ] @@ -171,7 +168,6 @@ def union(u: str, v: str) -> None: return list(groups.values()) -@lru_cache(maxsize=100) def build_query_pattern(group_size: int, n: int) -> str: """Cache query patterns for common group sizes.""" if group_size == 1: @@ -179,7 +175,7 @@ def build_query_pattern(group_size: int, n: int) -> str: return f"NEAR/{n}(" + " ".join('"{}"' for _ in range(group_size)) + ")" -def build_query(groups: List[Set[str]], n: int = 10) -> str: +def build_query(groups: list[set[str]], n: int = 10) -> str: """Build query with cached patterns.""" clauses = [] @@ -197,12 +193,22 @@ def build_query(groups: List[Set[str]], n: int = 10) -> str: return " OR ".join(clauses) +@lru_cache(maxsize=100) def paragraph_to_custom_queries( paragraph: str, top_n: int = 10, proximity_n: int = 10, min_keywords: int = 1 -) -> List[str]: +) -> list[str]: """ Optimized paragraph processing with minimal behavior changes. Added min_keywords parameter to filter out low-value queries. + + Args: + paragraph (str): The input paragraph to convert. + top_n (int): Number of top keywords to extract per sentence. + proximity_n (int): The proximity window for NEAR/n. + min_keywords (int): Minimum number of keywords required to form a query. + + Returns: + list[str]: The list of custom query strings. """ if not paragraph or not paragraph.strip(): return [] @@ -236,3 +242,45 @@ def paragraph_to_custom_queries( queries.append(query) return queries + + +def batch_paragraphs_to_custom_queries( + paragraphs: list[str], + top_n: int = 10, + proximity_n: int = 10, + min_keywords: int = 1, + n_process: int = 1, +) -> list[list[str]]: + """ + Processes multiple paragraphs using nlp.pipe for better performance. + + Args: + paragraphs (list[str]): list of paragraphs to process. + top_n (int): Number of top keywords to extract per sentence. + proximity_n (int): The proximity window for NEAR/n. + min_keywords (int): Minimum number of keywords required to form a query. + n_process (int): Number of processes to use for multiprocessing. + + Returns: + list[list[str]]: A list where each element is a list of queries for a paragraph. + """ + results = [] + for doc in nlp.pipe( + paragraphs, disable=["lemmatizer", "textcat"], n_process=n_process + ): + queries = [] + for sent in doc.sents: + sent_doc = sent.as_doc() + keywords = extract_keywords(sent_doc, top_n) + if len(keywords) < min_keywords: + continue + keyword_positions = keyword_matcher.find_matches(sent_doc, keywords) + if not keyword_positions: + continue + groups = find_proximity_groups(keywords, keyword_positions, proximity_n) + query = build_query(groups, proximity_n) + if query: + queries.append(query) + results.append(queries) + + return results