Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(indexer): fan out when index code chunks #3496

Merged
merged 4 commits into from
Dec 2, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 32 additions & 18 deletions crates/tabby-index/src/indexer.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use std::collections::HashSet;
use std::{collections::HashSet, sync::mpsc::sync_channel};

use anyhow::{bail, Result};
use async_stream::stream;
Expand Down Expand Up @@ -76,35 +76,49 @@ impl<T: ToIndexId> TantivyDocBuilder<T> {
let doc_id = id.clone();
let doc_attributes = self.builder.build_attributes(&document).await;
let s = stream! {
let mut failed_count: u64 = 0;
let (tx, rx) = sync_channel(32);

for await chunk_doc in self.build_chunks(cloned_id, source_id.clone(), updated_at, document).await {
match chunk_doc.await {
Ok(Ok(doc)) => {
yield tokio::spawn(async move { Some(doc) });
}
Ok(Err(e)) => {
warn!("Failed to build chunk for document '{}': {}", doc_id, e);
failed_count += 1;
}
Err(e) => {
warn!("Failed to call build chunk '{}': {}", doc_id, e);
failed_count += 1;
let tx = tx.clone();
let doc_id = doc_id.clone();
yield tokio::spawn(async move {
match chunk_doc.await {
Ok(Ok(doc)) => {
Some(doc)
}
Ok(Err(e)) => {
warn!("Failed to build chunk for document '{}': {}", doc_id, e);
tx.send(1).unwrap();
None
}
Err(e) => {
warn!("Failed to call build chunk '{}': {}", doc_id, e);
tx.send(1).unwrap();
None
}
}
}
});
};

// Drop the original tx so the channel closes once all senders are gone;
// each cloned tx is dropped when the spawned task that owns it finishes.
drop(tx);

let mut doc = doc! {
schema.field_id => doc_id,
schema.field_source_id => source_id,
schema.field_corpus => self.corpus,
schema.field_attributes => doc_attributes,
schema.field_updated_at => updated_at,
};
if failed_count > 0 {
doc.add_u64(schema.field_failed_chunks_count, failed_count);
}

yield tokio::spawn(async move { Some(doc) });
yield tokio::spawn(async move {
let failed_count = rx.iter().count();
if failed_count > 0 {
doc.add_u64(schema.field_failed_chunks_count, failed_count as u64);
}
Some(doc)
});
};

(id, s)
Expand Down
12 changes: 10 additions & 2 deletions crates/tabby-index/src/indexer_tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -195,14 +195,18 @@ mod builder_tests {
std::thread::available_parallelism().unwrap().get() * 2,
32,
))
.map(|handler| handler.unwrap())
.collect::<Vec<_>>()
.await
.into_iter()
.flatten()
.collect::<Vec<_>>()
});

// the chunks should fail because no embedding is provided
// the last element is the document itself
assert_eq!(res.len(), 1);
let doc = res.last().unwrap().as_ref().unwrap().as_ref().unwrap();
let doc = res.last().unwrap();

let schema = IndexSchema::instance();
let failed_count = doc
Expand Down Expand Up @@ -250,16 +254,20 @@ mod builder_tests {
std::thread::available_parallelism().unwrap().get() * 2,
32,
))
.map(|handler| handler.unwrap())
.collect::<Vec<_>>()
.await
.into_iter()
.flatten()
.collect::<Vec<_>>()
});

// The last element is the document itself,
// while the preceding elements are the chunks.
// Given that the embedding is empty,
// all chunks should be considered failed and skipped.
assert_eq!(res.len(), 1);
let doc = res.last().unwrap().as_ref().unwrap().as_ref().unwrap();
let doc = res.last().unwrap();

let schema = IndexSchema::instance();
let failed_count = doc
Expand Down
Loading