diff --git a/parquet/src/arrow/async_reader/mod.rs b/parquet/src/arrow/async_reader/mod.rs index 4f3befe4266..5323251b07e 100644 --- a/parquet/src/arrow/async_reader/mod.rs +++ b/parquet/src/arrow/async_reader/mod.rs @@ -611,11 +611,23 @@ impl std::fmt::Debug for StreamState { } } -/// An asynchronous [`Stream`](https://docs.rs/futures/latest/futures/stream/trait.Stream.html) of [`RecordBatch`] -/// for a parquet file that can be constructed using [`ParquetRecordBatchStreamBuilder`]. +/// An asynchronous [`Stream`] of [`RecordBatch`] constructed using [`ParquetRecordBatchStreamBuilder`] to read parquet files. /// /// `ParquetRecordBatchStream` also provides [`ParquetRecordBatchStream::next_row_group`] for fetching row groups, /// allowing users to decode record batches separately from I/O. +/// +/// # I/O Buffering +/// +/// `ParquetRecordBatchStream` buffers *all* data pages selected after predicates +/// (projection + filtering, etc) and decodes the rows from those buffered pages. +/// +/// For example, if all rows and columns are selected, the entire row group is +/// buffered in memory during decode. This minimizes the number of IO operations +/// required, which is especially important for object stores, where IO operations +/// have latencies in the hundreds of milliseconds. +/// +/// +/// [`Stream`]: https://docs.rs/futures/latest/futures/stream/trait.Stream.html pub struct ParquetRecordBatchStream { metadata: Arc,