Skip to content

Commit

Permalink
GraphRAG: Better search for entities (#2834)
Browse files Browse the repository at this point in the history
  • Loading branch information
javitonino authored Feb 4, 2025
1 parent 6dde36b commit 9f994b6
Show file tree
Hide file tree
Showing 10 changed files with 259 additions and 138 deletions.
6 changes: 3 additions & 3 deletions nidx/nidx_relation/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ mod resource_indexer;
mod schema;

use nidx_protos::{
relation_node::NodeType, resource::ResourceStatus, RelationNode, RelationNodeFilter, RelationPrefixSearchRequest,
RelationSearchRequest, RelationSearchResponse,
relation_node::NodeType, relation_prefix_search_request::Search, resource::ResourceStatus, RelationNode,
RelationNodeFilter, RelationPrefixSearchRequest, RelationSearchRequest, RelationSearchResponse,
};
use nidx_tantivy::{
index_reader::{open_index_with_deletions, DeletionQueryBuilder},
Expand Down Expand Up @@ -140,7 +140,7 @@ impl RelationSearcher {
prefixes.iter().filter(|prefix| prefix.len() >= MIN_SUGGEST_PREFIX_LENGTH).cloned().map(|prefix| {
RelationSearchRequest {
prefix: Some(RelationPrefixSearchRequest {
prefix,
search: Some(Search::Prefix(prefix)),
node_filters: vec![RelationNodeFilter {
node_type: NodeType::Entity.into(),
..Default::default()
Expand Down
112 changes: 88 additions & 24 deletions nidx/nidx_relation/src/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,18 +21,21 @@ use std::collections::HashSet;
use std::fmt::Debug;
use std::path::Path;

use nidx_protos::relation_prefix_search_request::Search;
use nidx_protos::{
EntitiesSubgraphResponse, RelationNode, RelationPrefixSearchResponse, RelationSearchRequest, RelationSearchResponse,
};
use tantivy::collector::TopDocs;
use tantivy::query::{AllQuery, BooleanQuery, FuzzyTermQuery, Occur, Query, TermQuery};
use tantivy::query::{BooleanQuery, FuzzyTermQuery, Occur, Query, TermQuery};
use tantivy::schema::IndexRecordOption;
use tantivy::{Index, IndexReader, Term};

use crate::schema::Schema;
use crate::{io_maps, schema};

const FUZZY_DISTANCE: u8 = 1;
// Maximum number of consecutive words combined into a single entity search term
const ENTITY_WORD_SIZE: usize = 3;
const NUMBER_OF_RESULTS_SUGGEST: usize = 10;
// Hard limit until we have pagination in place
const MAX_NUM_RELATIONS_RESULTS: usize = 500;
Expand Down Expand Up @@ -216,11 +219,14 @@ impl RelationsReaderService {
return Ok(None);
};

let Some(search) = &prefix_request.search else {
return Err(anyhow::anyhow!("Search terms needed"));
};

// if prefix_request.prefix.is_empty() {
// return Ok(Some(RelationPrefixSearchResponse::default()));
// }

let prefix = schema::normalize(&prefix_request.prefix);
let searcher = self.reader.searcher();
let topdocs = TopDocs::with_limit(NUMBER_OF_RESULTS_SUGGEST);
let schema = &self.schema;
Expand Down Expand Up @@ -260,33 +266,66 @@ impl RelationsReaderService {
target_types.push((Occur::Should, target_clause));
}

let source_typing_query: Box<dyn Query> = if source_types.is_empty() {
Box::new(AllQuery)
} else {
Box::new(BooleanQuery::new(source_types))
let mut source_q: Vec<(Occur, Box<dyn Query>)> = Vec::new();
let mut target_q: Vec<(Occur, Box<dyn Query>)> = Vec::new();

if !source_types.is_empty() {
source_q.push((Occur::Must, Box::new(BooleanQuery::new(source_types))));
};

let target_typing_query: Box<dyn Query> = if target_types.is_empty() {
Box::new(AllQuery)
} else {
Box::new(BooleanQuery::new(target_types))
if !target_types.is_empty() {
target_q.push((Occur::Must, Box::new(BooleanQuery::new(target_types))));
};

let source_value_query: Box<dyn Query> = Box::new(FuzzyTermQuery::new_prefix(
Term::from_field_text(schema.normalized_source_value, &prefix),
FUZZY_DISTANCE,
true,
));
let target_value_query: Box<dyn Query> = Box::new(FuzzyTermQuery::new_prefix(
Term::from_field_text(schema.normalized_target_value, &prefix),
FUZZY_DISTANCE,
true,
));
match search {
Search::Query(query) => {
// This search is intended to do a normal tokenized search on entities names. However, since we
// do some custom normalization for these fields, we need to do some custom handling here.
// Feel free to replace this with something better if we start indexing entity names with tokenization.
let mut source_prefix_q = Vec::new();
let mut target_prefix_q = Vec::new();
// Search for all groups of words in the query, e.g:
// query "Films with James Bond"
// returns:
// "Films", "with", "James", "Bond"
// "Films with", "with James", "James Bond"
// "Films with James", "with James Bond"
let words: Vec<_> = query.split_whitespace().collect();
for end in 1..=words.len() {
for len in 1..=ENTITY_WORD_SIZE {
if len > end {
break;
}
let start = end - len;
self.add_fuzzy_prefix_query(&mut source_prefix_q, &mut target_prefix_q, &words[start..end]);
}
}
source_q.push((Occur::Must, Box::new(BooleanQuery::new(source_prefix_q))));
target_q.push((Occur::Must, Box::new(BooleanQuery::new(target_prefix_q))));
}
Search::Prefix(prefix) => {
let normalized_prefix = schema::normalize(prefix);
source_q.push((
Occur::Must,
Box::new(FuzzyTermQuery::new_prefix(
Term::from_field_text(self.schema.normalized_source_value, &normalized_prefix),
FUZZY_DISTANCE,
true,
)),
));
target_q.push((
Occur::Must,
Box::new(FuzzyTermQuery::new_prefix(
Term::from_field_text(self.schema.normalized_target_value, &normalized_prefix),
FUZZY_DISTANCE,
true,
)),
));
}
}

let source_prefix_query =
BooleanQuery::new(vec![(Occur::Must, source_value_query), (Occur::Must, source_typing_query)]);
let target_prefix_query =
BooleanQuery::new(vec![(Occur::Must, target_value_query), (Occur::Must, target_typing_query)]);
let source_prefix_query = BooleanQuery::new(source_q);
let target_prefix_query = BooleanQuery::new(target_q);

let mut response = RelationPrefixSearchResponse::default();
let mut results = HashSet::new();
Expand All @@ -303,6 +342,31 @@ impl RelationsReaderService {
response.nodes = results.into_iter().map(Into::into).collect();
Ok(Some(response))
}

/// Adds a fuzzy match clause for `prefix` (a run of consecutive query words) to both
/// the source-node and the target-node clause lists.
///
/// The words are normalized with `schema::normalize_words` so they match the custom
/// normalization applied to `normalized_source_value` / `normalized_target_value`
/// at index time. Each clause is pushed with `Occur::Should`, so callers can collect
/// many word-group candidates and any single match is enough to score a document.
///
/// Note: this uses `FuzzyTermQuery::new` (whole-term fuzzy match with edit distance
/// `FUZZY_DISTANCE`, transpositions allowed), not the `new_prefix` variant used by
/// the `Search::Prefix` path — so each word group must match a full indexed term.
fn add_fuzzy_prefix_query(
&self,
source_queries: &mut Vec<(Occur, Box<dyn Query>)>,
target_queries: &mut Vec<(Occur, Box<dyn Query>)>,
prefix: &[&str],
) {
// Normalize exactly like the indexed entity values (deunicode + lowercase).
let normalized_prefix = schema::normalize_words(prefix.iter().copied());
source_queries.push((
Occur::Should,
Box::new(FuzzyTermQuery::new(
Term::from_field_text(self.schema.normalized_source_value, &normalized_prefix),
FUZZY_DISTANCE,
true,
)),
));
// Same clause mirrored onto the target-value field.
target_queries.push((
Occur::Should,
Box::new(FuzzyTermQuery::new(
Term::from_field_text(self.schema.normalized_target_value, &normalized_prefix),
FUZZY_DISTANCE,
true,
)),
));
}
}

pub struct HashedRelationNode(pub RelationNode);
Expand Down
6 changes: 5 additions & 1 deletion nidx/nidx_relation/src/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,12 @@ use tantivy::TantivyDocument;

/// Normalizes text for entity matching: each whitespace-separated word is deunicoded and ASCII-lowercased before the words are rejoined.
pub fn normalize(source: &str) -> String {
normalize_words(source.split_whitespace())
}

pub fn normalize_words<'a>(source: impl Iterator<Item = &'a str>) -> String {
let mut normalized = String::new();
for segment in source.split_whitespace() {
for segment in source {
let deunicoded = deunicode::deunicode(segment);
let ascii_lower_cased = deunicoded.to_ascii_lowercase();
normalized.push_str(&ascii_lower_cased);
Expand Down
68 changes: 60 additions & 8 deletions nidx/nidx_relation/tests/test_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ mod common;
use nidx_protos::entities_subgraph_request::DeletedEntities;
use nidx_protos::relation::RelationType;
use nidx_protos::relation_node::NodeType;
use nidx_protos::relation_prefix_search_request::Search;
use nidx_protos::{
EntitiesSubgraphRequest, RelationMetadata, RelationNodeFilter, RelationPrefixSearchRequest, RelationSearchRequest,
Resource, ResourceId,
Expand Down Expand Up @@ -124,7 +125,6 @@ fn create_reader() -> anyhow::Result<RelationSearcher> {
to_start: Some(11),
to_end: Some(20),
data_augmentation_task_id: Some("mytask".to_string()),
..Default::default()
},
),
common::create_relation(
Expand All @@ -136,6 +136,15 @@ fn create_reader() -> anyhow::Result<RelationSearcher> {
"PLACES".to_string(),
RelationType::Entity,
),
common::create_relation(
"James Bond".to_string(),
NodeType::Entity,
"PEOPLE".to_string(),
"Ian Fleming".to_string(),
NodeType::Entity,
"PEOPLE".to_string(),
RelationType::Entity,
),
],
..Default::default()
};
Expand Down Expand Up @@ -250,17 +259,17 @@ fn test_prefix_search() -> anyhow::Result<()> {

let result = reader.search(&RelationSearchRequest {
prefix: Some(RelationPrefixSearchRequest {
prefix: "".to_string(),
search: Some(Search::Prefix("".to_string())),
..Default::default()
}),
..Default::default()
})?;

assert_eq!(result.prefix.unwrap().nodes.len(), 10);
assert_eq!(result.prefix.unwrap().nodes.len(), 12);

let result = reader.search(&RelationSearchRequest {
prefix: Some(RelationPrefixSearchRequest {
prefix: "do".to_string(),
search: Some(Search::Prefix("do".to_string())),
..Default::default()
}),
..Default::default()
Expand All @@ -270,7 +279,7 @@ fn test_prefix_search() -> anyhow::Result<()> {

let result = reader.search(&RelationSearchRequest {
prefix: Some(RelationPrefixSearchRequest {
prefix: "ann".to_string(),
search: Some(Search::Prefix("ann".to_string())),
..Default::default()
}),
..Default::default()
Expand All @@ -280,13 +289,56 @@ fn test_prefix_search() -> anyhow::Result<()> {
Ok(())
}

#[test]
/// Exercises the `Search::Query` variant: a free-form query is split into word
/// groups and matched fuzzily against indexed entity names.
fn test_prefix_query_search() -> anyhow::Result<()> {
let reader = create_reader()?;

// Exact entity name embedded in a longer query: "James Bond" should be found.
let result = reader.search(&RelationSearchRequest {
prefix: Some(RelationPrefixSearchRequest {
search: Some(Search::Query("Films with James Bond played by Roger Moore".to_string())),
..Default::default()
}),
..Default::default()
})?;
assert_eq!(result.prefix.unwrap().nodes.len(), 1);

// Typo tolerance: "Jomes" is within FUZZY_DISTANCE of "James", so it still matches.
let result = reader.search(&RelationSearchRequest {
prefix: Some(RelationPrefixSearchRequest {
search: Some(Search::Query("Films with Jomes Bond played by Roger Moore".to_string())),
..Default::default()
}),
..Default::default()
})?;
assert_eq!(result.prefix.unwrap().nodes.len(), 1);

// A single word of a multi-word entity is not enough to match the full
// indexed term "james bond" (fuzzy match is whole-term, not prefix).
let result = reader.search(&RelationSearchRequest {
prefix: Some(RelationPrefixSearchRequest {
search: Some(Search::Query("Just James".to_string())),
..Default::default()
}),
..Default::default()
})?;
assert_eq!(result.prefix.unwrap().nodes.len(), 0);

// Multiple entities in one query: both "James Bond" and "Anastasia" match.
let result = reader.search(&RelationSearchRequest {
prefix: Some(RelationPrefixSearchRequest {
search: Some(Search::Query("James Bond or Anastasia".to_string())),
..Default::default()
}),
..Default::default()
})?;
assert_eq!(result.prefix.unwrap().nodes.len(), 2);

Ok(())
}

#[test]
fn test_prefix_search_with_filters() -> anyhow::Result<()> {
let reader = create_reader()?;

let result = reader.search(&RelationSearchRequest {
prefix: Some(RelationPrefixSearchRequest {
prefix: "".to_string(),
search: Some(Search::Prefix("".to_string())),
node_filters: vec![RelationNodeFilter {
node_type: NodeType::Entity as i32,
node_subtype: Some("ANIMALS".to_string()),
Expand All @@ -299,7 +351,7 @@ fn test_prefix_search_with_filters() -> anyhow::Result<()> {

let result = reader.search(&RelationSearchRequest {
prefix: Some(RelationPrefixSearchRequest {
prefix: "".to_string(),
search: Some(Search::Prefix("".to_string())),
node_filters: vec![RelationNodeFilter {
node_type: NodeType::Resource as i32,
node_subtype: None,
Expand All @@ -312,7 +364,7 @@ fn test_prefix_search_with_filters() -> anyhow::Result<()> {

let result = reader.search(&RelationSearchRequest {
prefix: Some(RelationPrefixSearchRequest {
prefix: "".to_string(),
search: Some(Search::Prefix("".to_string())),
node_filters: vec![RelationNodeFilter {
node_type: NodeType::Resource as i32,
node_subtype: Some("foobarmissing".to_string()),
Expand Down
1 change: 0 additions & 1 deletion nidx/nidx_relation/tests/test_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,6 @@ fn test_index_docs() -> anyhow::Result<()> {
to_start: Some(11),
to_end: Some(20),
data_augmentation_task_id: Some("mytask".to_string()),
..Default::default()
},
),
],
Expand Down
Loading

0 comments on commit 9f994b6

Please sign in to comment.