From 3066e3b33b2e91cb14a1ff03a5dfd784919ae3ba Mon Sep 17 00:00:00 2001 From: Ib Green Date: Sun, 28 Jan 2024 17:34:26 -0500 Subject: [PATCH] feat(parquet): restore ParquetWasm loader --- .../src/lib/get-schema-from-fgb-header.ts | 1 + modules/geotiff/test/tiff.spec.ts | 2 +- modules/parquet/package.json | 6 +- modules/parquet/src/index.ts | 11 ++-- modules/parquet/src/{ => lib}/constants.ts | 6 +- .../src/lib/wasm/encode-parquet-wasm.ts | 66 +++++++++++++------ .../parquet/src/lib/wasm/load-wasm-browser.ts | 19 ------ .../parquet/src/lib/wasm/load-wasm-node.ts | 9 --- modules/parquet/src/lib/wasm/load-wasm.ts | 19 +++++- .../src/lib/wasm/parse-parquet-wasm.ts | 44 +++++-------- modules/parquet/src/parquet-wasm-loader.ts | 12 ++-- modules/parquet/src/parquet-wasm-writer.ts | 20 ++++-- .../parquet/src/parquetjs/parser/decoders.ts | 2 +- .../src/parquetjs/parser/parquet-reader.ts | 2 +- modules/parquet/test/index.ts | 3 +- ...s.disabled => parquet-wasm-loader.spec.ts} | 23 ++----- modules/zip/src/parse-zip/zip-composition.ts | 5 ++ package.json | 6 +- yarn.lock | 8 +-- 19 files changed, 137 insertions(+), 127 deletions(-) rename modules/parquet/src/{ => lib}/constants.ts (59%) delete mode 100644 modules/parquet/src/lib/wasm/load-wasm-browser.ts delete mode 100644 modules/parquet/src/lib/wasm/load-wasm-node.ts rename modules/parquet/test/{parquet-wasm-loader.spec.ts.disabled => parquet-wasm-loader.spec.ts} (82%) diff --git a/modules/flatgeobuf/src/lib/get-schema-from-fgb-header.ts b/modules/flatgeobuf/src/lib/get-schema-from-fgb-header.ts index 17203a9fe6..1df9390f30 100644 --- a/modules/flatgeobuf/src/lib/get-schema-from-fgb-header.ts +++ b/modules/flatgeobuf/src/lib/get-schema-from-fgb-header.ts @@ -86,6 +86,7 @@ enum fgbColumnType { } /** Convert FGB types to arrow like types */ +// eslint-disable-next-line complexity function getTypeFromFGBType(fgbType: fgbColumnType /* fgb.ColumnMeta['type'] */): DataType { switch (fgbType) { case fgbColumnType.Byte: diff --git a/modules/geotiff/test/tiff.spec.ts b/modules/geotiff/test/tiff.spec.ts index 8d708c53bf..69be96baf9 100644 --- a/modules/geotiff/test/tiff.spec.ts +++ b/modules/geotiff/test/tiff.spec.ts @@ -60,7 +60,7 @@ test('Correct OME-XML.', async (t) => { const tiff = await fromFile(TIFF_URL); const {metadata} = await loadGeoTiff(tiff); const {Name, Pixels} = metadata; - t.equal(Name, 'multi-channel.ome.tif', "Name should be 'multi-channel.ome.tif'."); + t.equal(Name, 'multi-channel.ome.tif', 'Name should be "multi-channel.ome.tif".'); // @ts-ignore t.equal(Pixels.SizeC, 3, 'Should have three channels.'); // @ts-ignore diff --git a/modules/parquet/package.json b/modules/parquet/package.json index 62405a84bb..21123827af 100644 --- a/modules/parquet/package.json +++ b/modules/parquet/package.json @@ -53,9 +53,7 @@ "util": false, "events": false, "./src/polyfills/buffer/buffer-polyfill.node.ts": "./src/polyfills/buffer/buffer-polyfill.browser.ts", - "./dist/polyfills/buffer/buffer-polyfill.node.js": "./dist/polyfills/buffer/buffer-polyfill.browser.js", - "./src/lib/wasm/load-wasm-node.ts": "./src/lib/wasm/load-wasm-browser.ts", - "./dist/lib/wasm/load-wasm-node.js": "./dist/lib/wasm/load-wasm-browser.js" + "./dist/polyfills/buffer/buffer-polyfill.node.js": "./dist/polyfills/buffer/buffer-polyfill.browser.js" }, "comments": [ "base64-js and ieee754 are used by buffer polyfill" @@ -76,7 +74,7 @@ "lz4js": "^0.2.0", "node-int64": "^0.4.0", "object-stream": "0.0.1", - "parquet-wasm": "^0.3.1", + "parquet-wasm": "^0.6.0-beta.1", "snappyjs": "^0.6.0", "thrift": "^0.19.0", "util": "^0.12.5", diff --git a/modules/parquet/src/index.ts b/modules/parquet/src/index.ts index bd7292cc4f..90983fa5fe 100644 --- a/modules/parquet/src/index.ts +++ b/modules/parquet/src/index.ts @@ -15,12 +15,13 @@ export { ParquetColumnarLoader } from './parquet-loader'; -// import type {ParquetWasmLoaderOptions} from './lib/wasm/parse-parquet-wasm'; -// import {parseParquetWasm} from './lib/wasm/parse-parquet-wasm'; -// import {ParquetWasmLoader as ParquetWasmWorkerLoader} from './parquet-wasm-loader'; - export {ParquetWriter as _ParquetWriter} from './parquet-writer'; -// export {ParquetWasmWriter} from './parquet-wasm-writer'; + +// EXPERIMENTAL - expose Parquet WASM loaders/writer + +export type {ParquetWasmLoaderOptions} from './parquet-wasm-loader'; +export {ParquetWasmLoader, ParquetWasmWorkerLoader} from './parquet-wasm-loader'; +export {ParquetWasmWriter} from './parquet-wasm-writer'; // EXPERIMENTAL - expose the internal parquetjs API diff --git a/modules/parquet/src/constants.ts b/modules/parquet/src/lib/constants.ts similarity index 59% rename from modules/parquet/src/constants.ts rename to modules/parquet/src/lib/constants.ts index d93a3ca991..b9861233c7 100644 --- a/modules/parquet/src/constants.ts +++ b/modules/parquet/src/lib/constants.ts @@ -2,7 +2,11 @@ // SPDX-License-Identifier: MIT // Copyright (c) vis.gl contributors -// Forked from https://github.com/kbajalc/parquets under MIT license (Copyright (c) 2017 ironSource Ltd.) +// __VERSION__ is injected by babel-plugin-version-inline +// @ts-ignore TS2304: Cannot find name '__VERSION__'. +export const VERSION = typeof __VERSION__ !== 'undefined' ? __VERSION__ : 'latest'; +export const PARQUET_WASM_URL = 'https://unpkg.com/parquet-wasm@0.6.0-beta.1/esm/arrow1_bg.wasm'; + /** * Parquet File Magic String */ diff --git a/modules/parquet/src/lib/wasm/encode-parquet-wasm.ts b/modules/parquet/src/lib/wasm/encode-parquet-wasm.ts index 7203db757f..28eaca3838 100644 --- a/modules/parquet/src/lib/wasm/encode-parquet-wasm.ts +++ b/modules/parquet/src/lib/wasm/encode-parquet-wasm.ts @@ -2,39 +2,67 @@ // SPDX-License-Identifier: MIT // Copyright (c) vis.gl contributors -import type {WriterOptions} from '@loaders.gl/loader-utils'; import type {ArrowTable} from '@loaders.gl/arrow'; import * as arrow from 'apache-arrow'; import {loadWasm} from './load-wasm'; -export type ParquetWriterOptions = WriterOptions & { - parquet?: { - wasmUrl?: string; - }; -}; +import type {ParquetWriterOptions} from '../../parquet-wasm-writer'; /** * Encode Arrow arrow.Table to Parquet buffer */ export async function encode( table: ArrowTable, - options?: ParquetWriterOptions + options: ParquetWriterOptions ): Promise { - const wasmUrl = options?.parquet?.wasmUrl; + const wasmUrl = options.parquet?.wasmUrl!; const wasm = await loadWasm(wasmUrl); + // Serialize the table to the IPC format. const arrowTable: arrow.Table = table.data; + const ipcStream = arrow.tableToIPC(arrowTable); - // Serialize a table to the IPC format. - const writer = arrow.RecordBatchStreamWriter.writeAll(arrowTable); - const arrowIPCBytes = writer.toUint8Array(true); - - // TODO: provide options for how to write table. - const writerProperties = new wasm.WriterPropertiesBuilder().build(); - const parquetBytes = wasm.writeParquet(arrowIPCBytes, writerProperties); - return parquetBytes.buffer.slice( - parquetBytes.byteOffset, - parquetBytes.byteLength + parquetBytes.byteOffset - ); + // Pass the IPC stream to the Parquet writer. + const wasmTable = wasm.Table.fromIPCStream(ipcStream); + const wasmProperties = new wasm.WriterPropertiesBuilder().build(); + try { + const parquetBytes = wasm.writeParquet(wasmTable, wasmProperties); + // const parquetBytes = wasm.writeParquet(wasmTable, wasmProperties); + return parquetBytes.buffer.slice( + parquetBytes.byteOffset, + parquetBytes.byteLength + parquetBytes.byteOffset + ); + } finally { + // wasmTable.free(); + // wasmProperties.free(); + } } + +// type WriteOptions = { +// compression?: number; +// dictionaryEnabled?: boolean; +// encoding?: number; +// maxRowGroupSize?: number; +// maxStatisticsSize?: number; +// statisticsEnabled?: boolean; +// writeBatchSize?: number; +// dataPageSizeLimit?: number; +// dictionaryPageSizeLimit?: number; +// }; + +// columnCompression: Record; +// columnDictionaryEnabled: Record; +// columnEncoding: Record; +// columnMaxStatisticsSize +// compression:Record; +// setCreatedBy +// setDataPageSizeLimit +// setDictionaryEnabled +// setDictionaryPageSizeLimit +// setEncoding +// setMaxRowGroupSize +// setMaxStatisticsSize +// setStatisticsEnabled +// setWriteBatchSize +// setWriterVersion diff --git a/modules/parquet/src/lib/wasm/load-wasm-browser.ts b/modules/parquet/src/lib/wasm/load-wasm-browser.ts deleted file mode 100644 index 13f35a1d7c..0000000000 --- a/modules/parquet/src/lib/wasm/load-wasm-browser.ts +++ /dev/null @@ -1,19 +0,0 @@ -// loaders.gl -// SPDX-License-Identifier: MIT -// Copyright (c) vis.gl contributors - -import * as wasmEsm from 'parquet-wasm/esm2/arrow1'; - -let cached: typeof wasmEsm | null = null; - -export async function loadWasm(wasmUrl?: string) { - if (cached !== null) { - return cached; - } - - // For ESM bundles, need to await the default export, which loads the WASM - await wasmEsm.default(wasmUrl); - cached = wasmEsm; - - return wasmEsm; -} diff --git a/modules/parquet/src/lib/wasm/load-wasm-node.ts b/modules/parquet/src/lib/wasm/load-wasm-node.ts deleted file mode 100644 index 5ee44eebb6..0000000000 --- a/modules/parquet/src/lib/wasm/load-wasm-node.ts +++ /dev/null @@ -1,9 +0,0 @@ -// loaders.gl -// SPDX-License-Identifier: MIT -// Copyright (c) vis.gl contributors - -import * as wasmNode from 'parquet-wasm/node/arrow1'; - -export async function loadWasm(wasmUrl?: string) { - return wasmNode; -} diff --git a/modules/parquet/src/lib/wasm/load-wasm.ts b/modules/parquet/src/lib/wasm/load-wasm.ts index 6c89cf2b1f..0c7c2d447a 100644 --- a/modules/parquet/src/lib/wasm/load-wasm.ts +++ b/modules/parquet/src/lib/wasm/load-wasm.ts @@ -2,4 +2,21 @@ // SPDX-License-Identifier: MIT // Copyright (c) vis.gl contributors -export {loadWasm} from './load-wasm-node'; +// eslint-disable-next-line import/default +import initWasm from 'parquet-wasm'; +import * as parquetWasm from 'parquet-wasm'; +import {PARQUET_WASM_URL} from '../constants'; + +let initializePromise: any; + +export async function loadWasm(wasmUrl: string = PARQUET_WASM_URL) { + if (!initializePromise && typeof initWasm === 'function') { + if (!wasmUrl) { + throw new Error('ParquetLoader: No wasmUrl provided'); + } + // @ts-ignore + initializePromise = initWasm(wasmUrl); + } + await initializePromise; + return parquetWasm; +} diff --git a/modules/parquet/src/lib/wasm/parse-parquet-wasm.ts b/modules/parquet/src/lib/wasm/parse-parquet-wasm.ts index b86e77eb3b..3ae930fd28 100644 --- a/modules/parquet/src/lib/wasm/parse-parquet-wasm.ts +++ b/modules/parquet/src/lib/wasm/parse-parquet-wasm.ts @@ -3,43 +3,31 @@ // Copyright (c) vis.gl contributors // eslint-disable -import type {LoaderOptions} from '@loaders.gl/loader-utils'; import type {ArrowTable} from '@loaders.gl/arrow'; import {serializeArrowSchema} from '@loaders.gl/arrow'; -import * as arrow from 'apache-arrow'; +import type {ParquetWasmLoaderOptions} from '../../parquet-wasm-loader'; import {loadWasm} from './load-wasm'; - -export type ParquetWasmLoaderOptions = LoaderOptions & { - parquet?: { - type?: 'arrow-table'; - wasmUrl?: string; - }; -}; +import * as arrow from 'apache-arrow'; export async function parseParquetWasm( arrayBuffer: ArrayBuffer, - options?: ParquetWasmLoaderOptions + options: ParquetWasmLoaderOptions ): Promise { + const arr = new Uint8Array(arrayBuffer); + const wasmUrl = options?.parquet?.wasmUrl; const wasm = await loadWasm(wasmUrl); + const wasmTable = wasm.readParquet(arr); + try { + const ipcStream = wasmTable.intoIPCStream(); + const arrowTable = arrow.tableFromIPC(ipcStream); - const arr = new Uint8Array(arrayBuffer); - const arrowIPCUint8Arr = wasm.readParquet(arr); - const arrowIPCBuffer = arrowIPCUint8Arr.buffer.slice( - arrowIPCUint8Arr.byteOffset, - arrowIPCUint8Arr.byteLength + arrowIPCUint8Arr.byteOffset - ); - - const reader = arrow.RecordBatchStreamReader.from(arrowIPCBuffer); - const recordBatches: arrow.RecordBatch[] = []; - for (const recordBatch of reader) { - recordBatches.push(recordBatch); + return { + shape: 'arrow-table', + schema: serializeArrowSchema(arrowTable.schema), + data: arrowTable + }; + } finally { + // wasmTable.free(); } - const arrowTable = new arrow.Table(recordBatches); - - return { - shape: 'arrow-table', - schema: serializeArrowSchema(arrowTable.schema), - data: arrowTable - }; } diff --git a/modules/parquet/src/parquet-wasm-loader.ts b/modules/parquet/src/parquet-wasm-loader.ts index 2fda7c68fb..8f6bc82fe9 100644 --- a/modules/parquet/src/parquet-wasm-loader.ts +++ b/modules/parquet/src/parquet-wasm-loader.ts @@ -6,10 +6,7 @@ import type {Loader, LoaderWithParser, LoaderOptions} from '@loaders.gl/loader-u import type {ArrowTable} from '@loaders.gl/arrow'; import {parseParquetWasm} from './lib/wasm/parse-parquet-wasm'; - -// __VERSION__ is injected by babel-plugin-version-inline -// @ts-ignore TS2304: Cannot find name '__VERSION__'. -const VERSION = typeof __VERSION__ !== 'undefined' ? __VERSION__ : 'latest'; +import {VERSION, PARQUET_WASM_URL} from './lib/constants'; /** Parquet WASM loader options */ export type ParquetWasmLoaderOptions = LoaderOptions & { @@ -34,7 +31,7 @@ export const ParquetWasmWorkerLoader: Loader = { ...ParquetWasmWorkerLoader, - parse: parseParquetWasm + parse(arrayBuffer: ArrayBuffer, options?: ParquetWasmLoaderOptions) { + options = {parquet: {...ParquetWasmLoader.options.parquet, ...options?.parquet}, ...options}; + return parseParquetWasm(arrayBuffer, options); + } }; diff --git a/modules/parquet/src/parquet-wasm-writer.ts b/modules/parquet/src/parquet-wasm-writer.ts index 673fb318c9..36c225ffa1 100644 --- a/modules/parquet/src/parquet-wasm-writer.ts +++ b/modules/parquet/src/parquet-wasm-writer.ts @@ -4,11 +4,16 @@ import type {WriterWithEncoder} from '@loaders.gl/loader-utils'; import type {ArrowTable} from '@loaders.gl/arrow'; -import {encode, ParquetWriterOptions} from './lib/wasm/encode-parquet-wasm'; +import {encode} from './lib/wasm/encode-parquet-wasm'; +import type {WriterOptions} from '@loaders.gl/loader-utils'; -// __VERSION__ is injected by babel-plugin-version-inline -// @ts-ignore TS2304: Cannot find name '__VERSION__'. -const VERSION = typeof __VERSION__ !== 'undefined' ? __VERSION__ : 'latest'; +import {VERSION, PARQUET_WASM_URL} from './lib/constants'; + +export type ParquetWriterOptions = WriterOptions & { + parquet?: { + wasmUrl?: string; + }; +}; /** Parquet WASM writer */ export const ParquetWasmWriter: WriterWithEncoder = { @@ -21,8 +26,11 @@ export const ParquetWasmWriter: WriterWithEncoder { test('Load Parquet file', async (t) => { const url = `${PARQUET_DIR}/geoparquet/example.parquet`; - const table = await load(url, ParquetWasmLoader, { - parquet: { - wasmUrl: WASM_URL - } - }); + const table = await load(url, ParquetWasmLoader, {}); const arrowTable = table.data; t.equal(arrowTable.numRows, 5); t.deepEqual(table.schema?.fields.map((f) => f.name), [ @@ -41,17 +36,13 @@ test('Load Parquet file', async (t) => { t.end(); }); -test('ParquetWasmLoader#load', async (t) => { +test.only('ParquetWasmLoader#load', async (t) => { t.comment('SUPPORTED FILES'); for (const {title, path} of WASM_SUPPORTED_FILES) { const url = `${PARQUET_DIR}/apache/${path}`; - const table = await load(url, ParquetWasmLoader, { - parquet: { - wasmUrl: WASM_URL - } - }); + const table = await load(url, ParquetWasmLoader); const arrowTable = table.data; - t.ok(arrowTable, `GOOD(${title})`); + t.ok(arrowTable instanceof arrow.Table, `GOOD(${title})`); } t.end(); @@ -62,15 +53,9 @@ test('ParquetWasmWriter#writer/loader round trip', async (t) => { const parquetBuffer = await encode(table, ParquetWasmWriter, { worker: false, - parquet: { - wasmUrl: WASM_URL - } }); const newTable = await load(parquetBuffer, ParquetWasmLoader, { worker: false, - parquet: { - wasmUrl: WASM_URL - } }); t.deepEqual(table.data.schema, newTable.data.schema); diff --git a/modules/zip/src/parse-zip/zip-composition.ts b/modules/zip/src/parse-zip/zip-composition.ts index a78e92ed85..f9fa8079f7 100644 --- a/modules/zip/src/parse-zip/zip-composition.ts +++ b/modules/zip/src/parse-zip/zip-composition.ts @@ -200,12 +200,17 @@ export function getFileIterator( * @param subfolder relative path from the root folder. * @returns list of paths */ +<<<<<<< HEAD export async function getAllFiles( basePath: string, subfolder: string = '', fsPassed?: NodeFilesystem ): Promise { const fs = fsPassed ? fsPassed : new NodeFilesystem({}); +======= +export async function getAllFiles(basePath: string, subfolder: string = ''): Promise { + fs ||= new NodeFilesystem({}); +>>>>>>> 7e5932d46 (feat(parquet): restore ParquetWasm loader) const files = await fs.readdir(pathJoin(basePath, subfolder)); const arrayOfFiles: string[] = []; diff --git a/package.json b/package.json index 8155eca42a..6b7726b1e2 100644 --- a/package.json +++ b/package.json @@ -56,16 +56,18 @@ "dependencies": {}, "resolutions_notes": [ "Ensure we use recent typescript", - "Note: prettier 3 required for JSON import assertions" + "Note: prettier 3 required for JSON import assertions", + "apache-arrow version should be aligned with parquet-wasm version" ], "resolutions": { "@typescript-eslint/eslint-plugin": "^6.0.0", "@typescript-eslint/parser": "^6.0.0", + "apache-arrow": "^15.0.0", "prettier": "3.0.3", "typescript": "^5.3.0" }, "volta": { - "node": "18.18.2", + "node": "18.19.0", "yarn": "1.22.19" } } diff --git a/yarn.lock b/yarn.lock index e15586f265..77692bcf59 100644 --- a/yarn.lock +++ b/yarn.lock @@ -9139,10 +9139,10 @@ parent-module@^1.0.0: dependencies: callsites "^3.0.0" -parquet-wasm@^0.3.1: - version "0.3.1" - resolved "https://registry.yarnpkg.com/parquet-wasm/-/parquet-wasm-0.3.1.tgz#4c7306cac00e1bfacb00ee9e395a9ef8a4c8623f" - integrity sha512-8OMYwh7N+IcQBeKTuDgpvJR7xh/YKPgnQcs29CQW0Q4ziwroRu7N6tQmmsjiCaQ/W7enBNppquj6VAe2fA1yYQ== +parquet-wasm@^0.6.0-beta.1: + version "0.6.0-beta.1" + resolved "https://registry.yarnpkg.com/parquet-wasm/-/parquet-wasm-0.6.0-beta.1.tgz#f86245a69802aa44d947182237f84fabc394654f" + integrity sha512-uA4w7XEssSub8+1zBmlnd9KlIKIfK43Syw2GrWK4pi0YG7N1ohx5T5PQbeNAqBjGZMFeC6bGw5FdGVAyKu9r1Q== parse-data-uri@^0.2.0: version "0.2.0"