diff --git a/Cargo.lock b/Cargo.lock index 2b9e55a4..5ed649ff 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -77,9 +77,9 @@ checksum = "bf7d0a018de4f6aa429b9d33d69edf69072b1c5b1cb8d3e4a5f7ef898fc3eb76" [[package]] name = "arrow" -version = "48.0.0" +version = "49.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "edb738d83750ec705808f6d44046d165e6bb8623f64e29a4d53fcb136ab22dfb" +checksum = "5bc25126d18a012146a888a0298f2c22e1150327bd2765fc76d710a556b2d614" dependencies = [ "ahash", "arrow-arith", @@ -97,9 +97,9 @@ dependencies = [ [[package]] name = "arrow-arith" -version = "48.0.0" +version = "49.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c5c3d17fc5b006e7beeaebfb1d2edfc92398b981f82d9744130437909b72a468" +checksum = "34ccd45e217ffa6e53bbb0080990e77113bdd4e91ddb84e97b77649810bcf1a7" dependencies = [ "arrow-array", "arrow-buffer", @@ -112,9 +112,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "48.0.0" +version = "49.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "55705ada5cdde4cb0f202ffa6aa756637e33fea30e13d8d0d0fd6a24ffcee1e3" +checksum = "6bda9acea48b25123c08340f3a8ac361aa0f74469bb36f5ee9acf923fce23e9d" dependencies = [ "ahash", "arrow-buffer", @@ -128,9 +128,9 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "48.0.0" +version = "49.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a722f90a09b94f295ab7102542e97199d3500128843446ef63e410ad546c5333" +checksum = "01a0fc21915b00fc6c2667b069c1b64bdd920982f426079bc4a7cab86822886c" dependencies = [ "bytes", "half", @@ -139,15 +139,16 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "48.0.0" +version = "49.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af01fc1a06f6f2baf31a04776156d47f9f31ca5939fe6d00cd7a059f95a46ff1" +checksum = "5dc0368ed618d509636c1e3cc20db1281148190a78f43519487b2daf07b63b4a" dependencies 
= [ "arrow-array", "arrow-buffer", "arrow-data", "arrow-schema", "arrow-select", + "base64", "chrono", "half", "lexical-core", @@ -156,9 +157,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "48.0.0" +version = "49.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d0a547195e607e625e7fafa1a7269b8df1a4a612c919efd9b26bd86e74538f3a" +checksum = "907fafe280a3874474678c1858b9ca4cb7fd83fb8034ff5b6d6376205a08c634" dependencies = [ "arrow-buffer", "arrow-schema", @@ -178,9 +179,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "48.0.0" +version = "49.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e36bf091502ab7e37775ff448413ef1ffff28ff93789acb669fffdd51b394d51" +checksum = "79a43d6808411886b8c7d4f6f7dd477029c1e77ffffffb7923555cc6579639cd" dependencies = [ "arrow-array", "arrow-buffer", @@ -192,9 +193,9 @@ dependencies = [ [[package]] name = "arrow-ord" -version = "48.0.0" +version = "49.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4502123d2397319f3a13688432bc678c61cb1582f2daa01253186da650bf5841" +checksum = "9b23b0e53c0db57c6749997fd343d4c0354c994be7eca67152dd2bdb9a3e1bb4" dependencies = [ "arrow-array", "arrow-buffer", @@ -207,9 +208,9 @@ dependencies = [ [[package]] name = "arrow-row" -version = "48.0.0" +version = "49.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "249fc5a07906ab3f3536a6e9f118ec2883fbcde398a97a5ba70053f0276abda4" +checksum = "361249898d2d6d4a6eeb7484be6ac74977e48da12a4dd81a708d620cc558117a" dependencies = [ "ahash", "arrow-array", @@ -222,18 +223,18 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "48.0.0" +version = "49.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9d7a8c3f97f5ef6abd862155a6f39aaba36b029322462d72bbcfa69782a50614" +checksum = "09e28a5e781bf1b0f981333684ad13f5901f4cd2f20589eab7cf1797da8fc167" dependencies = [ 
"bitflags 2.4.0", ] [[package]] name = "arrow-select" -version = "48.0.0" +version = "49.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f868f4a5001429e20f7c1994b5cd1aa68b82e3db8cf96c559cdb56dc8be21410" +checksum = "4f6208466590960efc1d2a7172bc4ff18a67d6e25c529381d7f96ddaf0dc4036" dependencies = [ "ahash", "arrow-array", @@ -245,9 +246,9 @@ dependencies = [ [[package]] name = "arrow-string" -version = "48.0.0" +version = "49.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a27fdf8fc70040a2dee78af2e217479cb5b263bd7ab8711c7999e74056eb688a" +checksum = "a4a48149c63c11c9ff571e50ab8f017d2a7cb71037a882b42f6354ed2da9acc7" dependencies = [ "arrow-array", "arrow-buffer", @@ -262,7 +263,7 @@ dependencies = [ [[package]] name = "arrow-wasm" version = "0.1.0" -source = "git+https://github.com/kylebarron/arrow-wasm?rev=40363b64fc8bbb8c4a2fb8a30156f8811182dada#40363b64fc8bbb8c4a2fb8a30156f8811182dada" +source = "git+https://github.com/kylebarron/arrow-wasm?rev=564a7485fcd585647ba77e2c5d8f2cff6db02122#564a7485fcd585647ba77e2c5d8f2cff6db02122" dependencies = [ "arrow", "arrow-schema", @@ -1132,9 +1133,9 @@ dependencies = [ [[package]] name = "parquet" -version = "48.0.0" +version = "49.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "239229e6a668ab50c61de3dce61cf0fa1069345f7aa0f4c934491f92205a4945" +checksum = "af88740a842787da39b3d69ce5fbf6fce97d20211d3b299fee0a0da6430c74d4" dependencies = [ "ahash", "arrow-array", @@ -1197,7 +1198,7 @@ dependencies = [ "wasm-bindgen-test", "wasm-streams", "web-sys", - "zstd 0.12.4", + "zstd 0.13.0", ] [[package]] diff --git a/README.md b/README.md index f5ee4c31..08f59f21 100644 --- a/README.md +++ b/README.md @@ -20,6 +20,8 @@ npm install parquet-wasm ### Two APIs? +**Important Note!**: the maintainer of arrow2 and parquet2 has stepped back, and therefore I plan to deprecate the parquet2-based API in the future. 
See issue [#308](https://github.com/kylebarron/parquet-wasm/issues/308). + These bindings expose _two_ APIs to users because there are _two separate implementations_ of Parquet and Arrow in Rust. - [`parquet`](https://crates.io/crates/parquet) and [`arrow`](https://crates.io/crates/arrow): These are the "official" Rust implementations of Arrow and Parquet. These projects started earlier and may be more feature complete. @@ -77,13 +79,8 @@ The WASM bundle must be compiled with the `console_error_panic_hook` for this fu ## Example ```js -import { tableFromArrays, tableFromIPC, tableToIPC } from "apache-arrow"; -import { - readParquet, - writeParquet, - Compression, - WriterPropertiesBuilder, -} from "parquet-wasm"; +import * as arrow from "apache-arrow"; +import * as parquet from "parquet-wasm"; // Create Arrow Table in JS const LENGTH = 2000; @@ -96,47 +93,53 @@ const rainDates = Array.from( (_, i) => new Date(Date.now() - 1000 * 60 * 60 * 24 * i) ); -const rainfall = tableFromArrays({ +const rainfall = arrow.tableFromArrays({ precipitation: rainAmounts, date: rainDates, }); // Write Arrow Table to Parquet -const writerProperties = new WriterPropertiesBuilder() - .setCompression(Compression.ZSTD) + +// wasmTable is an Arrow table in WebAssembly memory +const wasmTable = parquet.Table.fromIPCStream(arrow.tableToIPC(rainfall, "stream")); +const writerProperties = new parquet.WriterPropertiesBuilder() + .setCompression(parquet.Compression.ZSTD) .build(); -const parquetBuffer = writeParquet( - tableToIPC(rainfall, "stream"), - writerProperties -); +const parquetUint8Array = parquet.writeParquet(wasmTable, writerProperties); // Read Parquet buffer back to Arrow Table -const table = tableFromIPC(readParquet(parquetBuffer)); +// arrowWasmTable is an Arrow table in WebAssembly memory +const arrowWasmTable = parquet.readParquet(parquetUint8Array); + +// table is now an Arrow table in JS memory +const table = arrow.tableFromIPC(arrowWasmTable.intoIPCStream()); 
console.log(table.schema.toString()); // Schema<{ 0: precipitation: Float32, 1: date: Date64 }> ``` ### Published examples +(These may use older versions of the library with a different API.) + - [GeoParquet on the Web (Observable)](https://observablehq.com/@kylebarron/geoparquet-on-the-web) - [Hello, Parquet-WASM (Observable)](https://observablehq.com/@bmschmidt/hello-parquet-wasm) ## Performance considerations -> Tl;dr: Try the new - [`readParquetFFI`](https://kylebarron.dev/parquet-wasm/modules/bundler_arrow2.html#readParquetFFI) - API, new in 0.4.0. This API is less well tested than the "normal" `readParquet` API, but should be +> Tl;dr: When you have a `Table` object (resulting from `readParquet`), try the new + [`Table.intoFFI`](https://kylebarron.dev/parquet-wasm/classes/bundler_arrow2.Table.html#intoFFI) + API to move it to JavaScript memory. This API is less well tested than the [`Table.intoIPCStream`](https://kylebarron.dev/parquet-wasm/classes/bundler_arrow2.Table.html#intoIPCStream) API, but should be faster and have **much** less memory overhead (by a factor of 2). If you hit any bugs, please [create a reproducible issue](https://github.com/kylebarron/parquet-wasm/issues/new). Under the hood, `parquet-wasm` first decodes a Parquet file into Arrow _in WebAssembly memory_. But then that WebAssembly memory needs to be copied into JavaScript for use by Arrow JS. The "normal" -read APIs (e.g. `readParquet`) use the [Arrow IPC +conversion APIs (e.g. `Table.intoIPCStream`) use the [Arrow IPC format](https://arrow.apache.org/docs/python/ipc.html) to get the data back to JavaScript. But this requires another memory copy _inside WebAssembly_ to assemble the various arrays into a single buffer to be copied back to JS. 
-Instead, the new `readParquetFFI` API uses Arrow's [C Data +Instead, the new `Table.intoFFI` API uses Arrow's [C Data Interface](https://arrow.apache.org/docs/format/CDataInterface.html) to be able to copy or view Arrow arrays from within WebAssembly memory without any serialization. @@ -151,21 +154,19 @@ and the Arrow C Data Interface if you want to read more! ### Example ```js -import { Table } from "apache-arrow"; +import * as arrow from "apache-arrow"; import { parseRecordBatch } from "arrow-js-ffi"; // Edit the `parquet-wasm` import as necessary -import { readParquetFFI, __wasm } from "parquet-wasm/node2"; +import { readParquet, wasmMemory } from "parquet-wasm/node2"; -// A reference to the WebAssembly memory object. The way to access this is different for each -// environment. In Node, use the __wasm export as shown below. In ESM the memory object will -// be found on the returned default export. -const WASM_MEMORY = __wasm.memory; +// A reference to the WebAssembly memory object. +const WASM_MEMORY = wasmMemory(); const resp = await fetch("https://example.com/file.parquet"); const parquetUint8Array = new Uint8Array(await resp.arrayBuffer()); -const wasmArrowTable = readParquetFFI(parquetUint8Array); +const wasmArrowTable = readParquet(parquetUint8Array).intoFFI(); -const recordBatches = []; +const recordBatches = []; // arrow.RecordBatch[] for (let i = 0; i < wasmArrowTable.numBatches(); i++) { // Note: Unless you know what you're doing, setting `true` below is recommended to _copy_ // table data from WebAssembly into JavaScript memory. This may become the default in the @@ -179,7 +180,7 @@ for (let i = 0; i < wasmArrowTable.numBatches(); i++) { recordBatches.push(recordBatch); } -const table = new Table(recordBatches); +const table = new arrow.Table(recordBatches); // VERY IMPORTANT! You must call `drop` on the Wasm table object when you're done using it // to release the Wasm memory.