From 04f6c572cb00ea80ec4a0242f72174dde29358be Mon Sep 17 00:00:00 2001
From: Connor Tsui
Date: Tue, 13 Feb 2024 10:12:45 -0500
Subject: [PATCH] remove extra dependencies

---
 eggstrain/Cargo.toml                |   2 -
 eggstrain/src/storage_client/mod.rs | 114 ++++++++++++++--------------
 2 files changed, 58 insertions(+), 58 deletions(-)

diff --git a/eggstrain/Cargo.toml b/eggstrain/Cargo.toml
index 84dfb86..a1260a7 100644
--- a/eggstrain/Cargo.toml
+++ b/eggstrain/Cargo.toml
@@ -14,8 +14,6 @@ authors = [
 anyhow = "1"
 arrow = "50"
 async-trait = "0.1"
-datafusion = "35"
 substrait = "0.24"
 tokio = { version = "1", features = ["full"] }
-tokio-stream = "0.1"
 rayon = "1"
diff --git a/eggstrain/src/storage_client/mod.rs b/eggstrain/src/storage_client/mod.rs
index 20b9a7f..bb1613f 100644
--- a/eggstrain/src/storage_client/mod.rs
+++ b/eggstrain/src/storage_client/mod.rs
@@ -1,61 +1,63 @@
 //! Right now we have this in a submodule `storage_client.rs`, but the IO service
 //! team would probably create a crate and we could import it easily into our `Cargo.toml` file
-use datafusion::execution::SendableRecordBatchStream;
+// use datafusion::execution::SendableRecordBatchStream;
+
+// use datafusion::common::arrow::array::{Int32Array, RecordBatch};
+// use datafusion::common::arrow::datatypes::{DataType, Field, Schema};
+// use std::sync::Arc;
+
+// // Placeholder types to let this compile
+// type ColumnId = String;
+// type TableId = String;
+// type RecordId = usize;
+
+// /// For now, pretend that this is an opaque type that the
+// /// I/O Service team will provide to us in a crate.
+// /// This type should be `Sync` as well, to support
+// /// multiple instances of a `StorageClient`.
+// pub struct StorageClient;
+
+// /// Have some way to request specific types of data.
+// /// As long as it comes back as a `RecordBatch`,
+// /// we should be fine to have any type of request here.
+// pub enum BlobData {
+//     Table(TableId),
+//     Columns(TableId, Box<[ColumnId]>),
+//     Tuple(RecordId),
+// }
+
+// impl StorageClient {
+//     /// Have some sort of way to create a `StorageClient` on our local node.
+//     pub fn new(_id: usize) -> Self {
+//         Self
+//     }
+
+//     /// The only other function we need exposed would be a way to actually get data.
+//     /// What we should get is a stream of `Recordbatch`s, which is just Apache Arrow
+//     /// data in memory.
+//     ///
+//     /// The executor node really should not know what the underlying data is on the Blob data store.
+//     /// In our case it is Parquet, but since the Execution Engine is not in charge or loading
+//     /// those Parquet files, it should just receive it as in-memory Arrow data
+//     ///
+//     /// Note that we will likely re-export the `SendableRecordBatchRecord` from DataFusion
+//     /// and use that as the return type instead
+//     pub async fn request_data(&self, _request: BlobData) -> SendableRecordBatchStream {
+//         todo!()
+//     }
+
+//     pub async fn sample_request_data(_request: BlobData) -> SendableRecordBatchStream {
+//         todo!("Return some sample data")
+//     }
+
+//     /// https://docs.rs/datafusion/latest/datafusion/common/arrow/array/struct.RecordBatch.html
+//     pub async fn request_synchronous_data() -> RecordBatch {
+//         let id_array = Int32Array::from(vec![1, 2, 3, 4, 5]);
+//         let schema = Schema::new(vec![Field::new("id", DataType::Int32, false)]);
+
+//         RecordBatch::try_new(Arc::new(schema), vec![Arc::new(id_array)]).unwrap()
+//     }
+// }
 
-use datafusion::common::arrow::array::{Int32Array, RecordBatch};
-use datafusion::common::arrow::datatypes::{DataType, Field, Schema};
-use std::sync::Arc;
-
-// Placeholder types to let this compile
-type ColumnId = String;
-type TableId = String;
-type RecordId = usize;
-
-/// For now, pretend that this is an opaque type that the
-/// I/O Service team will provide to us in a crate.
-/// This type should be `Sync` as well, to support
-/// multiple instances of a `StorageClient`.
 pub struct StorageClient;
-
-/// Have some way to request specific types of data.
-/// As long as it comes back as a `RecordBatch`,
-/// we should be fine to have any type of request here.
-pub enum BlobData {
-    Table(TableId),
-    Columns(TableId, Box<[ColumnId]>),
-    Tuple(RecordId),
-}
-
-impl StorageClient {
-    /// Have some sort of way to create a `StorageClient` on our local node.
-    pub fn new(_id: usize) -> Self {
-        Self
-    }
-
-    /// The only other function we need exposed would be a way to actually get data.
-    /// What we should get is a stream of `Recordbatch`s, which is just Apache Arrow
-    /// data in memory.
-    ///
-    /// The executor node really should not know what the underlying data is on the Blob data store.
-    /// In our case it is Parquet, but since the Execution Engine is not in charge or loading
-    /// those Parquet files, it should just receive it as in-memory Arrow data
-    ///
-    /// Note that we will likely re-export the `SendableRecordBatchRecord` from DataFusion
-    /// and use that as the return type instead
-    pub async fn request_data(&self, _request: BlobData) -> SendableRecordBatchStream {
-        todo!()
-    }
-
-    pub async fn sample_request_data(_request: BlobData) -> SendableRecordBatchStream {
-        todo!("Return some sample data")
-    }
-
-    /// https://docs.rs/datafusion/latest/datafusion/common/arrow/array/struct.RecordBatch.html
-    pub async fn request_synchronous_data() -> RecordBatch {
-        let id_array = Int32Array::from(vec![1, 2, 3, 4, 5]);
-        let schema = Schema::new(vec![Field::new("id", DataType::Int32, false)]);
-
-        RecordBatch::try_new(Arc::new(schema), vec![Arc::new(id_array)]).unwrap()
-    }
-}
 
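For context, once the I/O service crate exists and the commented-out client above is re-enabled, the execution engine side would drive it roughly as in the sketch below. This is a hedged illustration and not part of the commit: it assumes the `StorageClient`/`BlobData` API shown above, assumes `SendableRecordBatchStream` really does end up re-exported from DataFusion as the doc comments suggest, and pulls in `futures::StreamExt` (not currently listed in this Cargo.toml) to drain the stream.

// Sketch only: assumes the commented-out `StorageClient`, `BlobData`, and
// `TableId` definitions above are re-enabled, and that `request_data` returns
// DataFusion's `SendableRecordBatchStream` as described in the doc comments.
use futures::StreamExt;

async fn scan_table(client: &StorageClient, table: TableId) -> anyhow::Result<()> {
    // Ask the I/O service for an entire table as a stream of Arrow `RecordBatch`es.
    let mut stream = client.request_data(BlobData::Table(table)).await;

    // Drain the stream; each item is a `Result<RecordBatch, DataFusionError>`.
    while let Some(batch) = stream.next().await {
        let batch = batch?;
        println!("received a batch with {} rows", batch.num_rows());
    }

    Ok(())
}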