Skip to content

Commit

Permalink
GraphRAG: Better search for entities (#2834)
Browse files Browse the repository at this point in the history
  • Loading branch information
javitonino authored Feb 4, 2025
1 parent 6dde36b commit 9f994b6
Show file tree
Hide file tree
Showing 10 changed files with 259 additions and 138 deletions.
6 changes: 3 additions & 3 deletions nidx/nidx_relation/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,8 @@ mod resource_indexer;
mod schema;

use nidx_protos::{
relation_node::NodeType, resource::ResourceStatus, RelationNode, RelationNodeFilter, RelationPrefixSearchRequest,
RelationSearchRequest, RelationSearchResponse,
relation_node::NodeType, relation_prefix_search_request::Search, resource::ResourceStatus, RelationNode,
RelationNodeFilter, RelationPrefixSearchRequest, RelationSearchRequest, RelationSearchResponse,
};
use nidx_tantivy::{
index_reader::{open_index_with_deletions, DeletionQueryBuilder},
Expand Down Expand Up @@ -140,7 +140,7 @@ impl RelationSearcher {
prefixes.iter().filter(|prefix| prefix.len() >= MIN_SUGGEST_PREFIX_LENGTH).cloned().map(|prefix| {
RelationSearchRequest {
prefix: Some(RelationPrefixSearchRequest {
prefix,
search: Some(Search::Prefix(prefix)),
node_filters: vec![RelationNodeFilter {
node_type: NodeType::Entity.into(),
..Default::default()
Expand Down
112 changes: 88 additions & 24 deletions nidx/nidx_relation/src/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,18 +21,21 @@ use std::collections::HashSet;
use std::fmt::Debug;
use std::path::Path;

use nidx_protos::relation_prefix_search_request::Search;
use nidx_protos::{
EntitiesSubgraphResponse, RelationNode, RelationPrefixSearchResponse, RelationSearchRequest, RelationSearchResponse,
};
use tantivy::collector::TopDocs;
use tantivy::query::{AllQuery, BooleanQuery, FuzzyTermQuery, Occur, Query, TermQuery};
use tantivy::query::{BooleanQuery, FuzzyTermQuery, Occur, Query, TermQuery};
use tantivy::schema::IndexRecordOption;
use tantivy::{Index, IndexReader, Term};

use crate::schema::Schema;
use crate::{io_maps, schema};

const FUZZY_DISTANCE: u8 = 1;
// Maximum number of consecutive words combined into a single entity search term
const ENTITY_WORD_SIZE: usize = 3;
const NUMBER_OF_RESULTS_SUGGEST: usize = 10;
// Hard limit until we have pagination in place
const MAX_NUM_RELATIONS_RESULTS: usize = 500;
Expand Down Expand Up @@ -216,11 +219,14 @@ impl RelationsReaderService {
return Ok(None);
};

let Some(search) = &prefix_request.search else {
return Err(anyhow::anyhow!("Search terms needed"));
};

// if prefix_request.prefix.is_empty() {
// return Ok(Some(RelationPrefixSearchResponse::default()));
// }

let prefix = schema::normalize(&prefix_request.prefix);
let searcher = self.reader.searcher();
let topdocs = TopDocs::with_limit(NUMBER_OF_RESULTS_SUGGEST);
let schema = &self.schema;
Expand Down Expand Up @@ -260,33 +266,66 @@ impl RelationsReaderService {
target_types.push((Occur::Should, target_clause));
}

let source_typing_query: Box<dyn Query> = if source_types.is_empty() {
Box::new(AllQuery)
} else {
Box::new(BooleanQuery::new(source_types))
let mut source_q: Vec<(Occur, Box<dyn Query>)> = Vec::new();
let mut target_q: Vec<(Occur, Box<dyn Query>)> = Vec::new();

if !source_types.is_empty() {
source_q.push((Occur::Must, Box::new(BooleanQuery::new(source_types))));
};

let target_typing_query: Box<dyn Query> = if target_types.is_empty() {
Box::new(AllQuery)
} else {
Box::new(BooleanQuery::new(target_types))
if !target_types.is_empty() {
target_q.push((Occur::Must, Box::new(BooleanQuery::new(target_types))));
};

let source_value_query: Box<dyn Query> = Box::new(FuzzyTermQuery::new_prefix(
Term::from_field_text(schema.normalized_source_value, &prefix),
FUZZY_DISTANCE,
true,
));
let target_value_query: Box<dyn Query> = Box::new(FuzzyTermQuery::new_prefix(
Term::from_field_text(schema.normalized_target_value, &prefix),
FUZZY_DISTANCE,
true,
));
match search {
Search::Query(query) => {
// This search is intended to do a normal tokenized search on entities names. However, since we
// do some custom normalization for these fields, we need to do some custom handling here.
// Feel free to replace this with something better if we start indexing entity names with tokenization.
let mut source_prefix_q = Vec::new();
let mut target_prefix_q = Vec::new();
// Search for all groups of words in the query, e.g:
// query "Films with James Bond"
// returns:
// "Films", "with", "James", "Bond"
// "Films with", "with James", "James Bond"
// "Films with James", "with James Bond"
let words: Vec<_> = query.split_whitespace().collect();
for end in 1..=words.len() {
for len in 1..=ENTITY_WORD_SIZE {
if len > end {
break;
}
let start = end - len;
self.add_fuzzy_prefix_query(&mut source_prefix_q, &mut target_prefix_q, &words[start..end]);
}
}
source_q.push((Occur::Must, Box::new(BooleanQuery::new(source_prefix_q))));
target_q.push((Occur::Must, Box::new(BooleanQuery::new(target_prefix_q))));
}
Search::Prefix(prefix) => {
let normalized_prefix = schema::normalize(prefix);
source_q.push((
Occur::Must,
Box::new(FuzzyTermQuery::new_prefix(
Term::from_field_text(self.schema.normalized_source_value, &normalized_prefix),
FUZZY_DISTANCE,
true,
)),
));
target_q.push((
Occur::Must,
Box::new(FuzzyTermQuery::new_prefix(
Term::from_field_text(self.schema.normalized_target_value, &normalized_prefix),
FUZZY_DISTANCE,
true,
)),
));
}
}

let source_prefix_query =
BooleanQuery::new(vec![(Occur::Must, source_value_query), (Occur::Must, source_typing_query)]);
let target_prefix_query =
BooleanQuery::new(vec![(Occur::Must, target_value_query), (Occur::Must, target_typing_query)]);
let source_prefix_query = BooleanQuery::new(source_q);
let target_prefix_query = BooleanQuery::new(target_q);

let mut response = RelationPrefixSearchResponse::default();
let mut results = HashSet::new();
Expand All @@ -303,6 +342,31 @@ impl RelationsReaderService {
response.nodes = results.into_iter().map(Into::into).collect();
Ok(Some(response))
}

/// Adds a fuzzy match clause for `prefix` (a run of consecutive query words) to both
/// the source-node and the target-node clause lists.
///
/// The words are normalized with `schema::normalize_words` so they match the custom
/// normalization applied to `normalized_source_value` / `normalized_target_value`
/// at index time. Each clause is pushed with `Occur::Should`, so callers can collect
/// many word-group candidates and any single match is enough to score a document.
///
/// Note: this uses `FuzzyTermQuery::new` (whole-term fuzzy match with edit distance
/// `FUZZY_DISTANCE`, transpositions allowed), not the `new_prefix` variant used by
/// the `Search::Prefix` path — so each word group must match a full indexed term.
fn add_fuzzy_prefix_query(
&self,
source_queries: &mut Vec<(Occur, Box<dyn Query>)>,
target_queries: &mut Vec<(Occur, Box<dyn Query>)>,
prefix: &[&str],
) {
// Normalize exactly like the indexed entity values (deunicode + lowercase).
let normalized_prefix = schema::normalize_words(prefix.iter().copied());
source_queries.push((
Occur::Should,
Box::new(FuzzyTermQuery::new(
Term::from_field_text(self.schema.normalized_source_value, &normalized_prefix),
FUZZY_DISTANCE,
true,
)),
));
// Same clause mirrored onto the target-value field.
target_queries.push((
Occur::Should,
Box::new(FuzzyTermQuery::new(
Term::from_field_text(self.schema.normalized_target_value, &normalized_prefix),
FUZZY_DISTANCE,
true,
)),
));
}
}

pub struct HashedRelationNode(pub RelationNode);
Expand Down
6 changes: 5 additions & 1 deletion nidx/nidx_relation/src/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,12 @@ use tantivy::TantivyDocument;

/// Normalizes text for entity matching: each whitespace-separated word is deunicoded and ASCII-lowercased before the words are rejoined.
pub fn normalize(source: &str) -> String {
normalize_words(source.split_whitespace())
}

pub fn normalize_words<'a>(source: impl Iterator<Item = &'a str>) -> String {
let mut normalized = String::new();
for segment in source.split_whitespace() {
for segment in source {
let deunicoded = deunicode::deunicode(segment);
let ascii_lower_cased = deunicoded.to_ascii_lowercase();
normalized.push_str(&ascii_lower_cased);
Expand Down
68 changes: 60 additions & 8 deletions nidx/nidx_relation/tests/test_reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ mod common;
use nidx_protos::entities_subgraph_request::DeletedEntities;
use nidx_protos::relation::RelationType;
use nidx_protos::relation_node::NodeType;
use nidx_protos::relation_prefix_search_request::Search;
use nidx_protos::{
EntitiesSubgraphRequest, RelationMetadata, RelationNodeFilter, RelationPrefixSearchRequest, RelationSearchRequest,
Resource, ResourceId,
Expand Down Expand Up @@ -124,7 +125,6 @@ fn create_reader() -> anyhow::Result<RelationSearcher> {
to_start: Some(11),
to_end: Some(20),
data_augmentation_task_id: Some("mytask".to_string()),
..Default::default()
},
),
common::create_relation(
Expand All @@ -136,6 +136,15 @@ fn create_reader() -> anyhow::Result<RelationSearcher> {
"PLACES".to_string(),
RelationType::Entity,
),
common::create_relation(
"James Bond".to_string(),
NodeType::Entity,
"PEOPLE".to_string(),
"Ian Fleming".to_string(),
NodeType::Entity,
"PEOPLE".to_string(),
RelationType::Entity,
),
],
..Default::default()
};
Expand Down Expand Up @@ -250,17 +259,17 @@ fn test_prefix_search() -> anyhow::Result<()> {

let result = reader.search(&RelationSearchRequest {
prefix: Some(RelationPrefixSearchRequest {
prefix: "".to_string(),
search: Some(Search::Prefix("".to_string())),
..Default::default()
}),
..Default::default()
})?;

assert_eq!(result.prefix.unwrap().nodes.len(), 10);
assert_eq!(result.prefix.unwrap().nodes.len(), 12);

let result = reader.search(&RelationSearchRequest {
prefix: Some(RelationPrefixSearchRequest {
prefix: "do".to_string(),
search: Some(Search::Prefix("do".to_string())),
..Default::default()
}),
..Default::default()
Expand All @@ -270,7 +279,7 @@ fn test_prefix_search() -> anyhow::Result<()> {

let result = reader.search(&RelationSearchRequest {
prefix: Some(RelationPrefixSearchRequest {
prefix: "ann".to_string(),
search: Some(Search::Prefix("ann".to_string())),
..Default::default()
}),
..Default::default()
Expand All @@ -280,13 +289,56 @@ fn test_prefix_search() -> anyhow::Result<()> {
Ok(())
}

#[test]
/// Exercises the `Search::Query` variant: a free-form query is split into word
/// groups and matched fuzzily against indexed entity names.
fn test_prefix_query_search() -> anyhow::Result<()> {
let reader = create_reader()?;

// Exact entity name embedded in a longer query: "James Bond" should be found.
let result = reader.search(&RelationSearchRequest {
prefix: Some(RelationPrefixSearchRequest {
search: Some(Search::Query("Films with James Bond played by Roger Moore".to_string())),
..Default::default()
}),
..Default::default()
})?;
assert_eq!(result.prefix.unwrap().nodes.len(), 1);

// Typo tolerance: "Jomes" is within FUZZY_DISTANCE of "James", so it still matches.
let result = reader.search(&RelationSearchRequest {
prefix: Some(RelationPrefixSearchRequest {
search: Some(Search::Query("Films with Jomes Bond played by Roger Moore".to_string())),
..Default::default()
}),
..Default::default()
})?;
assert_eq!(result.prefix.unwrap().nodes.len(), 1);

// A single word of a multi-word entity is not enough to match the full
// indexed term "james bond" (fuzzy match is whole-term, not prefix).
let result = reader.search(&RelationSearchRequest {
prefix: Some(RelationPrefixSearchRequest {
search: Some(Search::Query("Just James".to_string())),
..Default::default()
}),
..Default::default()
})?;
assert_eq!(result.prefix.unwrap().nodes.len(), 0);

// Multiple entities in one query: both "James Bond" and "Anastasia" match.
let result = reader.search(&RelationSearchRequest {
prefix: Some(RelationPrefixSearchRequest {
search: Some(Search::Query("James Bond or Anastasia".to_string())),
..Default::default()
}),
..Default::default()
})?;
assert_eq!(result.prefix.unwrap().nodes.len(), 2);

Ok(())
}

#[test]
fn test_prefix_search_with_filters() -> anyhow::Result<()> {
let reader = create_reader()?;

let result = reader.search(&RelationSearchRequest {
prefix: Some(RelationPrefixSearchRequest {
prefix: "".to_string(),
search: Some(Search::Prefix("".to_string())),
node_filters: vec![RelationNodeFilter {
node_type: NodeType::Entity as i32,
node_subtype: Some("ANIMALS".to_string()),
Expand All @@ -299,7 +351,7 @@ fn test_prefix_search_with_filters() -> anyhow::Result<()> {

let result = reader.search(&RelationSearchRequest {
prefix: Some(RelationPrefixSearchRequest {
prefix: "".to_string(),
search: Some(Search::Prefix("".to_string())),
node_filters: vec![RelationNodeFilter {
node_type: NodeType::Resource as i32,
node_subtype: None,
Expand All @@ -312,7 +364,7 @@ fn test_prefix_search_with_filters() -> anyhow::Result<()> {

let result = reader.search(&RelationSearchRequest {
prefix: Some(RelationPrefixSearchRequest {
prefix: "".to_string(),
search: Some(Search::Prefix("".to_string())),
node_filters: vec![RelationNodeFilter {
node_type: NodeType::Resource as i32,
node_subtype: Some("foobarmissing".to_string()),
Expand Down
1 change: 0 additions & 1 deletion nidx/nidx_relation/tests/test_writer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,6 @@ fn test_index_docs() -> anyhow::Result<()> {
to_start: Some(11),
to_end: Some(20),
data_augmentation_task_id: Some("mytask".to_string()),
..Default::default()
},
),
],
Expand Down
Loading

0 comments on commit 9f994b6

Please sign in to comment.