Skip to content

Commit

Permalink
Extend DocumentDeserialize with a stateful variant DocumentDeserializ…
Browse files Browse the repository at this point in the history
…eSeed

This is modelled on Serde's `DeserializeSeed`, including how the relevant API entry
points gain a `_seed` variant. It can be used for example to obtain runtime
field ID values when deserializing a struct field by field without relying on
the order of the fields as written to/read from the document store.
  • Loading branch information
adamreichold committed May 23, 2024
1 parent b806122 commit e9c16a4
Show file tree
Hide file tree
Showing 5 changed files with 92 additions and 12 deletions.
28 changes: 25 additions & 3 deletions src/core/searcher.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
use std::collections::BTreeMap;
use std::marker::PhantomData;
use std::sync::Arc;
use std::{fmt, io};

use crate::collector::Collector;
use crate::core::Executor;
use crate::index::{SegmentId, SegmentReader};
use crate::query::{Bm25StatisticsProvider, EnableScoring, Query};
use crate::schema::document::DocumentDeserialize;
use crate::schema::document::{DocumentDeserialize, DocumentDeserializeSeed};
use crate::schema::{Schema, Term};
use crate::space_usage::SearcherSpaceUsage;
use crate::store::{CacheStats, StoreReader};
Expand Down Expand Up @@ -86,8 +87,17 @@ impl Searcher {
/// The searcher uses the segment ordinal to route the
/// request to the right `Segment`.
pub fn doc<D: DocumentDeserialize>(&self, doc_address: DocAddress) -> crate::Result<D> {
// Stateless case: delegate to `doc_seed` with a `PhantomData` seed, which
// carries no data and defers to `D`'s `DocumentDeserialize` implementation.
self.doc_seed(doc_address, PhantomData)
}

/// A stateful variant of [`doc`][Self::doc].
pub fn doc_seed<T: DocumentDeserializeSeed>(
&self,
doc_address: DocAddress,
seed: T,
) -> crate::Result<T::Value> {
let store_reader = &self.inner.store_readers[doc_address.segment_ord as usize];
store_reader.get(doc_address.doc_id)
store_reader.get_seed(doc_address.doc_id, seed)
}

/// The cache stats for the underlying store reader.
Expand All @@ -109,9 +119,21 @@ impl Searcher {
&self,
doc_address: DocAddress,
) -> crate::Result<D> {
self.doc_async_seed(doc_address, PhantomData).await
}

#[cfg(feature = "quickwit")]
/// A stateful variant of [`doc_async`][Self::doc_async].
pub async fn doc_async_seed<T: DocumentDeserializeSeed>(
&self,
doc_address: DocAddress,
seed: T,
) -> crate::Result<T::Value> {
let executor = self.inner.index.search_executor();
let store_reader = &self.inner.store_readers[doc_address.segment_ord as usize];
store_reader.get_async(doc_address.doc_id, executor).await
store_reader
.get_async_seed(doc_address.doc_id, executor, seed)
.await
}

/// Access the schema associated with the index of this searcher.
Expand Down
22 changes: 22 additions & 0 deletions src/schema/document/de.rs
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,28 @@ pub trait DocumentDeserialize: Sized {
where D: DocumentDeserializer<'de>;
}

/// A stateful extension of [`DocumentDeserialize`].
///
/// Unlike [`DocumentDeserialize`], whose entry point is an associated function
/// without a receiver, the entry point here takes `self` by value. An
/// implementor can therefore carry state into the deserialization, and the
/// produced `Self::Value` may differ from the seed type itself.
pub trait DocumentDeserializeSeed: Sized {
/// The type produced by using this seed.
type Value;

/// Attempts to deserialize `Self::Value` from this seed and the given `deserializer`.
///
/// The seed is consumed by the call.
fn deserialize<'de, D>(self, deserializer: D) -> Result<Self::Value, DeserializeError>
where D: DocumentDeserializer<'de>;
}

impl<T: DocumentDeserialize> DocumentDeserializeSeed for PhantomData<T> {
    /// A `PhantomData<T>` seed yields a `T`.
    type Value = T;

    /// Forwards to the stateless [`DocumentDeserialize`] implementation of `T`;
    /// the seed itself carries no data.
    fn deserialize<'de, D>(self, deserializer: D) -> Result<T, DeserializeError>
    where D: DocumentDeserializer<'de> {
        T::deserialize(deserializer)
    }
}

/// A deserializer that can walk through each entry in the document.
pub trait DocumentDeserializer<'de> {
/// An indicator as to how many values are in the document.
Expand Down
4 changes: 2 additions & 2 deletions src/schema/document/default_document.rs
Original file line number Diff line number Diff line change
Expand Up @@ -603,7 +603,7 @@ impl<'a> Iterator for CompactDocObjectIter<'a> {
container: self.container,
value,
};
return Some((key, value));
Some((key, value))
}
}

Expand Down Expand Up @@ -637,7 +637,7 @@ impl<'a> Iterator for CompactDocArrayIter<'a> {
container: self.container,
value,
};
return Some(value);
Some(value)
}
}

Expand Down
5 changes: 3 additions & 2 deletions src/schema/document/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -169,8 +169,9 @@ use std::mem;

pub(crate) use self::de::BinaryDocumentDeserializer;
pub use self::de::{
ArrayAccess, DeserializeError, DocumentDeserialize, DocumentDeserializer, ObjectAccess,
ValueDeserialize, ValueDeserializer, ValueType, ValueVisitor,
ArrayAccess, DeserializeError, DocumentDeserialize, DocumentDeserializeSeed,
DocumentDeserializer, ObjectAccess, ValueDeserialize, ValueDeserializer, ValueType,
ValueVisitor,
};
pub use self::default_document::{
CompactDocArrayIter, CompactDocObjectIter, CompactDocValue, DocParsingError, TantivyDocument,
Expand Down
45 changes: 40 additions & 5 deletions src/store/reader.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use std::io;
use std::iter::Sum;
use std::marker::PhantomData;
use std::num::NonZeroUsize;
use std::ops::{AddAssign, Range};
use std::sync::atomic::{AtomicUsize, Ordering};
Expand All @@ -14,7 +15,9 @@ use super::Decompressor;
use crate::directory::FileSlice;
use crate::error::DataCorruption;
use crate::fastfield::AliveBitSet;
use crate::schema::document::{BinaryDocumentDeserializer, DocumentDeserialize};
use crate::schema::document::{
BinaryDocumentDeserializer, DocumentDeserialize, DocumentDeserializeSeed,
};
use crate::space_usage::StoreSpaceUsage;
use crate::store::index::Checkpoint;
use crate::DocId;
Expand Down Expand Up @@ -201,11 +204,21 @@ impl StoreReader {
/// It should not be called to score documents
/// for instance.
pub fn get<D: DocumentDeserialize>(&self, doc_id: DocId) -> crate::Result<D> {
// Stateless case: forward to `get_seed` with a `PhantomData` seed, which
// carries no data and defers to `D`'s `DocumentDeserialize` implementation.
self.get_seed(doc_id, PhantomData)
}

/// A stateful version of [`get`][Self::get].
pub fn get_seed<T: DocumentDeserializeSeed>(
&self,
doc_id: DocId,
seed: T,
) -> crate::Result<T::Value> {
let mut doc_bytes = self.get_document_bytes(doc_id)?;

let deserializer = BinaryDocumentDeserializer::from_reader(&mut doc_bytes)
.map_err(crate::TantivyError::from)?;
D::deserialize(deserializer).map_err(crate::TantivyError::from)
seed.deserialize(deserializer)
.map_err(crate::TantivyError::from)
}

/// Returns raw bytes of a given document.
Expand Down Expand Up @@ -237,16 +250,27 @@ impl StoreReader {
/// Iterator over all Documents in their order as they are stored in the doc store.
/// Use this, if you want to extract all Documents from the doc store.
/// The `alive_bitset` has to be forwarded from the `SegmentReader` or the results may be wrong.
pub fn iter<'a: 'b, 'b, D: DocumentDeserialize>(
pub fn iter<'a: 'b, 'b, D: DocumentDeserialize + 'b>(
&'b self,
alive_bitset: Option<&'a AliveBitSet>,
) -> impl Iterator<Item = crate::Result<D>> + 'b {
self.iter_seed(alive_bitset, &PhantomData)
}

/// A stateful variant of [`iter`][Self::iter].
pub fn iter_seed<'a: 'b, 'b, T: DocumentDeserializeSeed + Clone + 'b>(
&'b self,
alive_bitset: Option<&'a AliveBitSet>,
seed: &'b T,
) -> impl Iterator<Item = crate::Result<T::Value>> + 'b {
self.iter_raw(alive_bitset).map(|doc_bytes_res| {
let mut doc_bytes = doc_bytes_res?;

let deserializer = BinaryDocumentDeserializer::from_reader(&mut doc_bytes)
.map_err(crate::TantivyError::from)?;
D::deserialize(deserializer).map_err(crate::TantivyError::from)
seed.clone()
.deserialize(deserializer)
.map_err(crate::TantivyError::from)
})
}

Expand Down Expand Up @@ -389,11 +413,22 @@ impl StoreReader {
doc_id: DocId,
executor: &Executor,
) -> crate::Result<D> {
self.get_async_seed(doc_id, executor, PhantomData).await
}

/// A stateful variant of [`get_async`][Self::get_async].
pub async fn get_async_seed<T: DocumentDeserializeSeed>(
&self,
doc_id: DocId,
executor: &Executor,
seed: T,
) -> crate::Result<T::Value> {
let mut doc_bytes = self.get_document_bytes_async(doc_id, executor).await?;

let deserializer = BinaryDocumentDeserializer::from_reader(&mut doc_bytes)
.map_err(crate::TantivyError::from)?;
D::deserialize(deserializer).map_err(crate::TantivyError::from)
seed.deserialize(deserializer)
.map_err(crate::TantivyError::from)
}
}

Expand Down

0 comments on commit e9c16a4

Please sign in to comment.