Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

arrow2-convert migration 1: TUID, RowId, TableId #3853

Merged
merged 13 commits into from
Oct 16, 2023
Merged
5 changes: 2 additions & 3 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 2 additions & 3 deletions crates/re_arrow_store/src/store_arrow.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ use arrow2::{array::Array, chunk::Chunk, datatypes::Schema};
use nohash_hasher::IntMap;
use re_log_types::{
DataCellColumn, DataTable, DataTableResult, RowId, Timeline, COLUMN_INSERT_ID,
COLUMN_NUM_INSTANCES, COLUMN_ROW_ID,
COLUMN_NUM_INSTANCES,
};
use re_types::ComponentName;

Expand Down Expand Up @@ -149,8 +149,7 @@ fn serialize_control_columns(
columns.push(insert_id_column);
}

let (row_id_field, row_id_column) =
DataTable::serialize_control_column(COLUMN_ROW_ID, col_row_id)?;
let (row_id_field, row_id_column) = DataTable::serialize_control_column(col_row_id)?;
schema.fields.push(row_id_field);
columns.push(row_id_column);

Expand Down
9 changes: 4 additions & 5 deletions crates/re_arrow_store/src/store_sanity.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
use re_log_types::{
DataCellColumn, SizeBytes as _, TimeRange, COLUMN_NUM_INSTANCES, COLUMN_ROW_ID,
COLUMN_TIMEPOINT,
DataCellColumn, RowId, SizeBytes as _, TimeRange, COLUMN_NUM_INSTANCES, COLUMN_TIMEPOINT,
};
use re_types::ComponentName;
use re_types::{ComponentName, Loggable};

use crate::{DataStore, IndexedBucket, IndexedBucketInner, IndexedTable, PersistentIndexedTable};

Expand Down Expand Up @@ -193,7 +192,7 @@ impl IndexedBucket {
(!col_insert_id.is_empty())
.then(|| (DataStore::insert_id_key(), col_insert_id.len())), //
Some((COLUMN_TIMEPOINT.into(), col_time.len())),
Some((COLUMN_ROW_ID.into(), col_row_id.len())),
Some((RowId::name(), col_row_id.len())),
Some((COLUMN_NUM_INSTANCES.into(), col_num_instances.len())),
]
.into_iter()
Expand Down Expand Up @@ -274,7 +273,7 @@ impl PersistentIndexedTable {
let column_lengths = [
(!col_insert_id.is_empty())
.then(|| (DataStore::insert_id_key(), col_insert_id.len())), //
Some((COLUMN_ROW_ID.into(), col_row_id.len())),
Some((RowId::name(), col_row_id.len())),
Some((COLUMN_NUM_INSTANCES.into(), col_num_instances.len())),
]
.into_iter()
Expand Down
3 changes: 1 addition & 2 deletions crates/re_format/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,5 @@ all-features = true

[dependencies]
arrow2.workspace = true
arrow2_convert.workspace = true
comfy-table.workspace = true
re_tuid = { workspace = true, features = ["arrow2_convert"] }
re_tuid = { workspace = true, features = ["arrow"] }
17 changes: 9 additions & 8 deletions crates/re_format/src/arrow.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,12 @@
use std::fmt::Formatter;

use arrow2::{
array::{get_display, Array, ListArray, StructArray},
array::{get_display, Array, ListArray},
datatypes::{DataType, IntervalUnit, TimeUnit},
};
use arrow2_convert::deserialize::TryIntoCollection;
use comfy_table::{presets, Cell, Table};

use re_tuid::Tuid;
use re_tuid::{external::re_types::Loggable as _, Tuid};

// ---

Expand Down Expand Up @@ -39,8 +38,7 @@ pub fn get_custom_display<'a, F: std::fmt::Write + 'a>(

if let Some(DataType::Extension(name, _, _)) = datatype {
// TODO(#1775): This should be registered dynamically.
// NOTE: Can't call `Tuid::name()`, `Component` lives in `re_log_types`.
if name.as_str() == "rerun.tuid" {
if name.as_str() == Tuid::name() {
return Box::new(|w, index| {
if let Some(tuid) = parse_tuid(array, index) {
w.write_fmt(format_args!("{tuid}"))
Expand Down Expand Up @@ -68,9 +66,8 @@ fn parse_tuid(array: &dyn Array, index: usize) -> Option<Tuid> {
// New control columns: it's not a list to begin with!
_ => (array.to_boxed(), index),
};
let array = array.as_any().downcast_ref::<StructArray>()?;

let tuids: Vec<Tuid> = TryIntoCollection::try_into_collection(array.to_boxed()).ok()?;
let tuids = Tuid::from_arrow(array.as_ref()).ok()?;
tuids.get(index).copied()
}

Expand Down Expand Up @@ -228,7 +225,11 @@ where
.map(|(name, data_type)| {
Cell::new(format!(
"{}\n---\n{}",
name.replace("rerun.components.", "").replace("rerun.", ""),
name.replace("rerun.archetypes.", "")
.replace("rerun.components.", "")
.replace("rerun.datatypes.", "")
.replace("rerun.controls.", "")
.replace("rerun.", ""),
teh-cmc marked this conversation as resolved.
Show resolved Hide resolved
DisplayDataType(data_type.clone())
))
});
Expand Down
2 changes: 1 addition & 1 deletion crates/re_log_types/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ re_format.workspace = true
re_log.workspace = true
re_string_interner.workspace = true
re_tracing.workspace = true
re_tuid = { workspace = true, features = ["arrow2_convert"] }
re_tuid = { workspace = true, features = ["arrow"] }
re_types = { workspace = true, features = ["image"] }

# External
Expand Down
17 changes: 3 additions & 14 deletions crates/re_log_types/src/data_row.rs
Original file line number Diff line number Diff line change
Expand Up @@ -106,21 +106,8 @@ impl SizeBytes for DataCellRow {
// ---

/// A unique ID for a [`DataRow`].
#[derive(
Debug,
Clone,
Copy,
PartialEq,
Eq,
PartialOrd,
Ord,
Hash,
arrow2_convert::ArrowField,
arrow2_convert::ArrowSerialize,
arrow2_convert::ArrowDeserialize,
)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))]
#[arrow_field(transparent)]
pub struct RowId(pub(crate) re_tuid::Tuid);

impl std::fmt::Display for RowId {
Expand Down Expand Up @@ -170,6 +157,8 @@ impl std::ops::DerefMut for RowId {
}
}

re_tuid::delegate_arrow_tuid!(RowId as "rerun.controls.RowId");

/// A row's worth of data, i.e. an event: a list of [`DataCell`]s associated with an auto-generated
/// `RowId`, a user-specified [`TimePoint`] and [`EntityPath`], and an expected number of
/// instances.
Expand Down
72 changes: 47 additions & 25 deletions crates/re_log_types/src/data_table.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
use std::collections::BTreeMap;

use re_types::ComponentName;
use re_types::{ComponentName, Loggable};

use ahash::HashMap;
use itertools::{izip, Itertools as _};
Expand Down Expand Up @@ -36,6 +36,12 @@ pub enum DataTableError {
#[error("Could not serialize/deserialize component instances to/from Arrow: {0}")]
Arrow(#[from] arrow2::error::Error),

#[error("Could not serialize component instances to/from Arrow: {0}")]
Serialization(#[from] re_types::SerializationError),

#[error("Could not deserialize component instances to/from Arrow: {0}")]
Deserialization(#[from] re_types::DeserializationError),

// Needed to handle TryFrom<T> -> T
#[error("Infallible")]
Unreachable(#[from] std::convert::Infallible),
Expand Down Expand Up @@ -128,21 +134,8 @@ impl SizeBytes for DataCellColumn {
// ---

/// A unique ID for a [`DataTable`].
#[derive(
Debug,
Clone,
Copy,
PartialEq,
Eq,
PartialOrd,
Ord,
Hash,
arrow2_convert::ArrowField,
arrow2_convert::ArrowSerialize,
arrow2_convert::ArrowDeserialize,
)]
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[cfg_attr(feature = "serde", derive(serde::Deserialize, serde::Serialize))]
#[arrow_field(transparent)]
pub struct TableId(pub(crate) re_tuid::Tuid);

impl std::fmt::Display for TableId {
Expand Down Expand Up @@ -183,6 +176,8 @@ impl std::ops::DerefMut for TableId {
}
}

re_tuid::delegate_arrow_tuid!(TableId as "rerun.controls.TableId");

/// A sparse table's worth of data, i.e. a batch of events: a collection of [`DataRow`]s.
/// This is the top-level layer in our data model.
///
Expand Down Expand Up @@ -557,7 +552,6 @@ use arrow2_convert::{
// TODO(#1696): Those names should come from the datatypes themselves.

pub const COLUMN_INSERT_ID: &str = "rerun.insert_id";
pub const COLUMN_ROW_ID: &str = "rerun.row_id";
pub const COLUMN_TIMEPOINT: &str = "rerun.timepoint";
pub const COLUMN_ENTITY_PATH: &str = "rerun.entity_path";
pub const COLUMN_NUM_INSTANCES: &str = "rerun.num_instances";
Expand All @@ -566,7 +560,6 @@ pub const METADATA_KIND: &str = "rerun.kind";
pub const METADATA_KIND_DATA: &str = "data";
pub const METADATA_KIND_CONTROL: &str = "control";
pub const METADATA_KIND_TIME: &str = "time";
pub const METADATA_TABLE_ID: &str = "rerun.table_id";

impl DataTable {
/// Serializes the entire table into an arrow payload and schema.
Expand Down Expand Up @@ -669,13 +662,12 @@ impl DataTable {
let mut schema = Schema::default();
let mut columns = Vec::new();

let (row_id_field, row_id_column) =
Self::serialize_control_column(COLUMN_ROW_ID, col_row_id)?;
let (row_id_field, row_id_column) = Self::serialize_control_column(col_row_id)?;
schema.fields.push(row_id_field);
columns.push(row_id_column);

let (entity_path_field, entity_path_column) =
Self::serialize_control_column(COLUMN_ENTITY_PATH, col_entity_path)?;
Self::serialize_control_column_legacy(COLUMN_ENTITY_PATH, col_entity_path)?;
schema.fields.push(entity_path_field);
columns.push(entity_path_column);

Expand All @@ -687,13 +679,39 @@ impl DataTable {
schema.fields.push(num_instances_field);
columns.push(num_instances_column);

schema.metadata = [(METADATA_TABLE_ID.into(), table_id.to_string())].into();
schema.metadata = [(TableId::name().to_string(), table_id.to_string())].into();

Ok((schema, columns))
}

/// Serializes a single control column: an iterable of dense arrow-like data.
pub fn serialize_control_column<C: ArrowSerialize + ArrowField<Type = C> + 'static>(
pub fn serialize_control_column<'a, C: re_types::Component + 'a>(
values: &'a [C],
) -> DataTableResult<(Field, Box<dyn Array>)>
where
std::borrow::Cow<'a, C>: std::convert::From<&'a C>,
{
re_tracing::profile_function!();

let data: Box<dyn Array> = C::to_arrow(values)?;

// TODO(#3360): rethink our extension and metadata usage
let mut field = C::arrow_field()
.with_metadata([(METADATA_KIND.to_owned(), METADATA_KIND_CONTROL.to_owned())].into());

// TODO(#3360): rethink our extension and metadata usage
if let DataType::Extension(name, _, _) = data.data_type() {
field
.metadata
.extend([("ARROW:extension:name".to_owned(), name.clone())]);
}

Ok((field, data))
}

/// Serializes a single control column: an iterable of dense arrow-like data.
// TODO(#3741): remove once arrow2_convert is fully gone
pub fn serialize_control_column_legacy<C: ArrowSerialize + ArrowField<Type = C> + 'static>(
name: &str,
values: &[C],
) -> DataTableResult<(Field, Box<dyn Array>)> {
Expand Down Expand Up @@ -922,8 +940,12 @@ impl DataTable {
};

// NOTE: the unwrappings cannot fail since control_index() makes sure the index is valid
let col_row_id =
(&**chunk.get(control_index(COLUMN_ROW_ID)?).unwrap()).try_into_collection()?;
let col_row_id = RowId::from_arrow(
chunk
.get(control_index(RowId::name().as_str())?)
.unwrap()
.as_ref(),
)?;
let col_entity_path =
(&**chunk.get(control_index(COLUMN_ENTITY_PATH)?).unwrap()).try_into_collection()?;
// TODO(#3741): This is unnecessarily slow…
Expand Down Expand Up @@ -956,7 +978,7 @@ impl DataTable {

Ok(Self {
table_id,
col_row_id,
col_row_id: col_row_id.into(),
col_timelines,
col_entity_path,
col_num_instances,
Expand Down
4 changes: 2 additions & 2 deletions crates/re_log_types/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,8 @@ pub use self::data_row::{
pub use self::data_table::{
DataCellColumn, DataCellOptVec, DataTable, DataTableError, DataTableResult, EntityPathVec,
ErasedTimeVec, NumInstancesVec, RowIdVec, TableId, TimePointVec, COLUMN_ENTITY_PATH,
COLUMN_INSERT_ID, COLUMN_NUM_INSTANCES, COLUMN_ROW_ID, COLUMN_TIMEPOINT, METADATA_KIND,
METADATA_KIND_CONTROL, METADATA_KIND_DATA,
COLUMN_INSERT_ID, COLUMN_NUM_INSTANCES, COLUMN_TIMEPOINT, METADATA_KIND, METADATA_KIND_CONTROL,
METADATA_KIND_DATA,
};
pub use self::index::*;
pub use self::path::*;
Expand Down
13 changes: 8 additions & 5 deletions crates/re_tuid/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@ all-features = true
[features]
default = []

## Enable converting Tuid to arrow2
arrow2_convert = ["dep:arrow2", "dep:arrow2_convert"]
## Enable (de)serialization using Arrow.
arrow = ["dep:re_types", "dep:arrow2", "dep:thiserror"]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

having re_tuid depend on re_types feels very backwards to me. re_tuid is a very low-level building block. This would be like adding a dependency on re_types to the uuid crate. I think it makes much more sense to impl re_types::Loggable for Tuid in re_types instead.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah I guess if we start to add these kinds of external integrations into re_types, we might as well just split the core traits into their own crate already and avoid the intermediate headache.

I'll do the split in a follow up.


## Enable (de)serialization using serde.
serde = ["dep:serde"]
Expand All @@ -32,10 +32,13 @@ getrandom = "0.2"
once_cell.workspace = true
web-time.workspace = true

# Optional dependencies:
arrow2 = { workspace = true, optional = true } # used by arrow2_convert
arrow2_convert = { workspace = true, optional = true }
# Optional dependencies

re_types = { workspace = true, optional = true }

arrow2 = { workspace = true, optional = true }
serde = { version = "1", features = ["derive"], optional = true }
thiserror = { workspace = true, optional = true }

[dev-dependencies]
criterion = "0.5"
Expand Down
Loading