Skip to content

Commit

Permalink
Add doc_id_mapping to IVF
Browse files Browse the repository at this point in the history
  • Loading branch information
BuildKite committed Dec 2, 2024
1 parent 643c6e6 commit ff23bd9
Show file tree
Hide file tree
Showing 6 changed files with 218 additions and 13 deletions.
23 changes: 23 additions & 0 deletions rs/index/src/ivf/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ pub struct IvfBuilder {
vectors: Box<dyn VectorStorage<f32>>,
centroids: Box<dyn VectorStorage<f32>>,
posting_lists: Box<dyn for<'a> PostingListStorage<'a>>,
doc_id_mapping: Box<dyn VectorStorage<u64>>,
}

impl IvfBuilder {
Expand Down Expand Up @@ -70,11 +71,23 @@ impl IvfBuilder {
config.num_clusters,
));

let doc_id_mapping_path =
format!("{}/builder_doc_id_mapping_storage", config.base_directory);
create_dir(&doc_id_mapping_path)?;

let doc_id_mapping = Box::new(FileBackedAppendableVectorStorage::<u64>::new(
doc_id_mapping_path,
config.memory_size,
config.file_size,
1,
));

Ok(Self {
config,
vectors,
centroids,
posting_lists,
doc_id_mapping,
})
}

Expand All @@ -86,6 +99,10 @@ impl IvfBuilder {
&*self.vectors
}

/// Borrows the storage holding the document ids appended through this builder.
///
/// Each stored entry is a one-element `u64` vector; the position of an entry
/// in this storage is the id that `generate_id` handed out for it.
pub fn doc_id_mapping(&self) -> &dyn VectorStorage<u64> {
    self.doc_id_mapping.as_ref()
}

/// Borrows the builder's centroid storage as a read-only trait object.
pub fn centroids(&self) -> &dyn VectorStorage<f32> {
    self.centroids.as_ref()
}
Expand All @@ -112,6 +129,12 @@ impl IvfBuilder {
Ok(())
}

/// Registers `doc_id` in the doc-id mapping and returns the id assigned to it.
///
/// The assigned id is the number of entries already present in the mapping,
/// i.e. the position at which `doc_id` is appended. Ids are therefore handed
/// out sequentially in call order.
///
/// # Errors
///
/// Returns an error if the current number of entries does not fit in a `u32`
/// (the previous `as u32` cast would silently wrap and hand out a duplicate
/// id), or if appending to the underlying storage fails.
pub fn generate_id(&mut self, doc_id: u64) -> Result<u32> {
    // Checked narrowing instead of `as u32`: refuse to wrap around once the
    // mapping outgrows the u32 id space. The trait is named by full path so the
    // block does not depend on `TryFrom` being in the prelude.
    let generated_id: u32 = std::convert::TryFrom::try_from(self.doc_id_mapping.len())?;
    // Append only after the id has been validated, so a failed conversion
    // leaves the mapping untouched.
    self.doc_id_mapping.append(std::slice::from_ref(&doc_id))?;
    Ok(generated_id)
}

fn find_nearest_centroids(
vector: &[f32],
centroids: &dyn VectorStorage<f32>,
Expand Down
63 changes: 56 additions & 7 deletions rs/index/src/ivf/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,10 +47,16 @@ impl Ivf {
.get_centroid(i as usize)
.with_context(|| format!("Failed to get centroid at index {}", i))?;
let dist = L2DistanceCalculator::calculate(&vector, &centroid);
println!("TYB comparing {:?} {:?} {}", vector, centroid, dist);
distances.push((i as usize, dist));
}
distances.select_nth_unstable_by(num_probes - 1, |a, b| a.1.total_cmp(&b.1));
Ok(distances.into_iter().map(|(idx, _)| idx).collect())
println!("TYB {:?}", distances);
let mut nearest_centroids: Vec<(usize, f32)> =
distances.into_iter().take(num_probes).collect();
nearest_centroids.sort_by(|a, b| a.1.total_cmp(&b.1));
println!("TYB {:?}", nearest_centroids);
Ok(nearest_centroids.into_iter().map(|(idx, _)| idx).collect())
}
}

Expand Down Expand Up @@ -134,11 +140,13 @@ mod tests {

fn create_fixed_file_index_storage(
file_path: &String,
doc_id_mapping: &Vec<u64>,
centroids: &Vec<Vec<f32>>,
posting_lists: &Vec<Vec<u64>>,
) -> Result<usize> {
let mut file = File::create(file_path.clone())?;

let num_vectors = doc_id_mapping.len();
let num_clusters = centroids.len();
if num_clusters != posting_lists.len() {
return Err(anyhow!(
Expand All @@ -149,6 +157,7 @@ mod tests {
}

// Create a test header
let doc_id_mapping_len = size_of::<u64>() * (num_vectors + 1);
let num_features = centroids[0].len();
let centroids_len = size_of::<u64>() + num_features * num_clusters * size_of::<f32>();

Expand All @@ -158,7 +167,11 @@ mod tests {
offset += size_of::<u32>();
assert!(file.write_all(&(num_clusters as u32).to_le_bytes()).is_ok());
offset += size_of::<u32>();
assert!(file.write_all(&7u64.to_le_bytes()).is_ok());
assert!(file.write_all(&(num_vectors as u64).to_le_bytes()).is_ok());
offset += size_of::<u64>();
assert!(file
.write_all(&(doc_id_mapping_len as u64).to_le_bytes())
.is_ok());
offset += size_of::<u64>();
assert!(file
.write_all(&(centroids_len as u64).to_le_bytes())
Expand All @@ -175,6 +188,14 @@ mod tests {
assert!(file.write_all(&pad).is_ok());
offset += pad.len();

// Write doc_id_mapping
assert!(file.write_all(&(num_vectors as u64).to_le_bytes()).is_ok());
offset += size_of::<u64>();
for doc_id in doc_id_mapping.iter() {
assert!(file.write_all(&(*doc_id as u64).to_le_bytes()).is_ok());
offset += size_of::<u64>();
}

// Write centroids
assert!(file.write_all(&(num_clusters as u64).to_le_bytes()).is_ok());
offset += size_of::<u64>();
Expand Down Expand Up @@ -230,9 +251,16 @@ mod tests {
.expect("FixedFileVectorStorage should be created");

let file_path = format!("{}/index", base_dir);
let doc_id_mapping = vec![100, 101, 102];
let centroids = vec![vec![1.5, 2.5, 3.5], vec![5.5, 6.5, 7.5]];
let posting_lists = vec![vec![0], vec![1, 2]];
assert!(create_fixed_file_index_storage(&file_path, &centroids, &posting_lists).is_ok());
assert!(create_fixed_file_index_storage(
&file_path,
&doc_id_mapping,
&centroids,
&posting_lists
)
.is_ok());
let index_storage =
FixedIndexFile::new(file_path).expect("FixedIndexFile should be created");

Expand All @@ -256,15 +284,22 @@ mod tests {
.to_str()
.expect("Failed to convert temporary directory path to string")
.to_string();
let file_path = format!("{}/centroids", base_dir);
let file_path = format!("{}/index", base_dir);
let vector = vec![3.0, 4.0, 5.0];
let doc_id_mapping = vec![100, 101, 102];
let centroids = vec![
vec![1.0, 2.0, 3.0],
vec![4.0, 5.0, 6.0],
vec![7.0, 8.0, 9.0],
];
let posting_lists = vec![vec![0], vec![1], vec![2]];
assert!(create_fixed_file_index_storage(&file_path, &centroids, &posting_lists).is_ok());
assert!(create_fixed_file_index_storage(
&file_path,
&doc_id_mapping,
&centroids,
&posting_lists
)
.is_ok());
let index_storage =
FixedIndexFile::new(file_path).expect("FixedIndexFile should be created");
let num_probes = 2;
Expand Down Expand Up @@ -298,9 +333,16 @@ mod tests {
.expect("FixedFileVectorStorage should be created");

let file_path = format!("{}/index", base_dir);
let doc_id_mapping = vec![100, 101, 102, 103];
let centroids = vec![vec![1.5, 2.5, 3.5], vec![5.5, 6.5, 7.5]];
let posting_lists = vec![vec![0, 3], vec![1, 2]];
assert!(create_fixed_file_index_storage(&file_path, &centroids, &posting_lists).is_ok());
assert!(create_fixed_file_index_storage(
&file_path,
&doc_id_mapping,
&centroids,
&posting_lists
)
.is_ok());
let index_storage =
FixedIndexFile::new(file_path).expect("FixedIndexFile should be created");

Expand Down Expand Up @@ -340,9 +382,16 @@ mod tests {
.expect("FixedFileVectorStorage should be created");

let file_path = format!("{}/index", base_dir);
let doc_id_mapping = vec![100];
let centroids = vec![vec![100.0, 200.0, 300.0]];
let posting_lists = vec![vec![0]];
assert!(create_fixed_file_index_storage(&file_path, &centroids, &posting_lists).is_ok());
assert!(create_fixed_file_index_storage(
&file_path,
&doc_id_mapping,
&centroids,
&posting_lists
)
.is_ok());
let index_storage =
FixedIndexFile::new(file_path).expect("FixedIndexFile should be created");

Expand Down
18 changes: 17 additions & 1 deletion rs/index/src/ivf/reader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -68,10 +68,13 @@ mod tests {
})
.expect("Failed to create builder");
// Generate 1000 vectors of f32, dimension 4
for _ in 0..num_vectors {
for i in 0..num_vectors {
builder
.add_vector(generate_random_vector(num_features))
.expect("Vector should be added");
builder
.generate_id((i + 100) as u64)
.expect("Id should be generated");
}

assert!(builder.build().is_ok());
Expand Down Expand Up @@ -118,6 +121,19 @@ mod tests {
index.index_storage.header().centroids_len,
(num_clusters * num_features * size_of::<f32>() + size_of::<u64>()) as u64
);
// Verify doc_id_mapping content
for i in 0..num_vectors {
let ref_id = builder
.doc_id_mapping()
.get(i as u32)
.expect("Failed to read doc_id from FileBackedAppendableVectorStorage");
let read_id = index
.index_storage
.get_doc_id(i)
.expect("Failed to read doc_id from FixedFileVectorStorage");
assert_eq!(ref_id.len(), 1);
assert_eq!((*ref_id)[0], read_id);
}
// Verify centroid content
for i in 0..num_clusters {
let ref_vector = builder
Expand Down
Loading

0 comments on commit ff23bd9

Please sign in to comment.