Skip to content
This repository has been archived by the owner on Jun 6, 2024. It is now read-only.

Commit

Permalink
remove extra dependencies
Browse files Browse the repository at this point in the history
  • Loading branch information
connortsui20 committed Feb 13, 2024
1 parent 762376b commit 04f6c57
Show file tree
Hide file tree
Showing 2 changed files with 58 additions and 58 deletions.
2 changes: 0 additions & 2 deletions eggstrain/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,6 @@ authors = [
anyhow = "1"
arrow = "50"
async-trait = "0.1"
datafusion = "35"
substrait = "0.24"
tokio = { version = "1", features = ["full"] }
tokio-stream = "0.1"
rayon = "1"
114 changes: 58 additions & 56 deletions eggstrain/src/storage_client/mod.rs
Original file line number Diff line number Diff line change
@@ -1,61 +1,63 @@
//! Right now we have this in a submodule `storage_client.rs`, but the IO service
//! team would probably create a crate and we could import it easily into our `Cargo.toml` file
use datafusion::execution::SendableRecordBatchStream;
// use datafusion::execution::SendableRecordBatchStream;

// use datafusion::common::arrow::array::{Int32Array, RecordBatch};
// use datafusion::common::arrow::datatypes::{DataType, Field, Schema};
// use std::sync::Arc;

// // Placeholder types to let this compile
// type ColumnId = String;
// type TableId = String;
// type RecordId = usize;

// /// For now, pretend that this is an opaque type that the
// /// I/O Service team will provide to us in a crate.
// /// This type should be `Sync` as well, to support
// /// multiple instances of a `StorageClient`.
// pub struct StorageClient;

// /// Have some way to request specific types of data.
// /// As long as it comes back as a `RecordBatch`,
// /// we should be fine to have any type of request here.
// pub enum BlobData {
// Table(TableId),
// Columns(TableId, Box<[ColumnId]>),
// Tuple(RecordId),
// }

// impl StorageClient {
// /// Have some sort of way to create a `StorageClient` on our local node.
// pub fn new(_id: usize) -> Self {
// Self
// }

// /// The only other function we need exposed would be a way to actually get data.
// /// What we should get is a stream of `Recordbatch`s, which is just Apache Arrow
// /// data in memory.
// ///
// /// The executor node really should not know what the underlying data is on the Blob data store.
// /// In our case it is Parquet, but since the Execution Engine is not in charge or loading
// /// those Parquet files, it should just receive it as in-memory Arrow data
// ///
// /// Note that we will likely re-export the `SendableRecordBatchRecord` from DataFusion
// /// and use that as the return type instead
// pub async fn request_data(&self, _request: BlobData) -> SendableRecordBatchStream {
// todo!()
// }

// pub async fn sample_request_data(_request: BlobData) -> SendableRecordBatchStream {
// todo!("Return some sample data")
// }

// /// https://docs.rs/datafusion/latest/datafusion/common/arrow/array/struct.RecordBatch.html
// pub async fn request_synchronous_data() -> RecordBatch {
// let id_array = Int32Array::from(vec![1, 2, 3, 4, 5]);
// let schema = Schema::new(vec![Field::new("id", DataType::Int32, false)]);

// RecordBatch::try_new(Arc::new(schema), vec![Arc::new(id_array)]).unwrap()
// }
// }

use datafusion::common::arrow::array::{Int32Array, RecordBatch};
use datafusion::common::arrow::datatypes::{DataType, Field, Schema};
use std::sync::Arc;

// Placeholder types to let this compile
type ColumnId = String;
type TableId = String;
type RecordId = usize;

/// For now, pretend that this is an opaque type that the
/// I/O Service team will provide to us in a crate.
/// This type should be `Sync` as well, to support
/// multiple instances of a `StorageClient`.
pub struct StorageClient;

/// Have some way to request specific types of data.
/// As long as it comes back as a `RecordBatch`,
/// we should be fine to have any type of request here.
pub enum BlobData {
Table(TableId),
Columns(TableId, Box<[ColumnId]>),
Tuple(RecordId),
}

impl StorageClient {
/// Have some sort of way to create a `StorageClient` on our local node.
pub fn new(_id: usize) -> Self {
Self
}

/// The only other function we need exposed would be a way to actually get data.
/// What we should get is a stream of `Recordbatch`s, which is just Apache Arrow
/// data in memory.
///
/// The executor node really should not know what the underlying data is on the Blob data store.
/// In our case it is Parquet, but since the Execution Engine is not in charge or loading
/// those Parquet files, it should just receive it as in-memory Arrow data
///
/// Note that we will likely re-export the `SendableRecordBatchRecord` from DataFusion
/// and use that as the return type instead
pub async fn request_data(&self, _request: BlobData) -> SendableRecordBatchStream {
todo!()
}

pub async fn sample_request_data(_request: BlobData) -> SendableRecordBatchStream {
todo!("Return some sample data")
}

/// https://docs.rs/datafusion/latest/datafusion/common/arrow/array/struct.RecordBatch.html
pub async fn request_synchronous_data() -> RecordBatch {
let id_array = Int32Array::from(vec![1, 2, 3, 4, 5]);
let schema = Schema::new(vec![Field::new("id", DataType::Int32, false)]);

RecordBatch::try_new(Arc::new(schema), vec![Arc::new(id_array)]).unwrap()
}
}

0 comments on commit 04f6c57

Please sign in to comment.