chore(parquet): Make parquet WASM loader into the primary loader #3121

Merged
merged 6 commits on Oct 20, 2024
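For orientation: this PR promotes the WASM-backed (Arrow) Parquet loader and renames the existing loaders accordingly. A minimal sketch of how an application might load a Parquet file through the renamed ParquetArrowLoader, assuming standard @loaders.gl/core usage; the exact option path for the WASM URL is an assumption (only options.parquet?.wasmUrl appears in the diffs below):

import {load} from '@loaders.gl/core';
import {ParquetArrowLoader} from '@loaders.gl/parquet';

// Hypothetical usage sketch: load a Parquet file into an Arrow table.
// The nested `parquet.wasmUrl` option mirrors the wasmUrl access seen in the
// encoder/parser diffs; the precise option shape is an assumption.
const arrowTable = await load('data.parquet', ParquetArrowLoader, {
  parquet: {wasmUrl: './parquet_wasm_bg.wasm'} // placeholder path to the parquet-wasm binary
});
console.log(arrowTable.schema);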
7 changes: 5 additions & 2 deletions modules/loader-utils/src/format-types.ts
@@ -1,5 +1,9 @@
// loaders.gl
// SPDX-License-Identifier: MIT
// Copyright (c) vis.gl contributors

/**
* A worker loader definition that can be used with `@loaders.gl/core` functions
* An object that describes a format
*/
export type Format = {
/** Human readable name */
@@ -18,7 +22,6 @@ export type Format = {
binary?: boolean;
/** Is this a text format */
text?: boolean;

/** Test some initial bytes of content to see if this loader might be a match */
tests?: (((ArrayBuffer: ArrayBuffer) => boolean) | ArrayBuffer | string)[];
};
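As a hedged illustration of the Format type documented above, a format descriptor might look like the following. Only name, binary, text and tests appear in the hunk; the remaining fields are assumptions added for completeness:

// Sketch of a hypothetical Parquet format descriptor (not taken from this PR).
const ExampleParquetFormat = {
  name: 'Apache Parquet', // "Human readable name" per the type above
  id: 'parquet', // assumption: not visible in this hunk
  extensions: ['parquet'], // assumption
  mimeTypes: ['application/octet-stream'], // assumption
  binary: true, // Parquet is a binary format
  tests: ['PAR1'] // Parquet files start with the magic bytes "PAR1"
};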
25 changes: 16 additions & 9 deletions modules/parquet/src/index.ts
@@ -6,22 +6,29 @@ export {Buffer} from './polyfills/buffer/install-buffer-polyfill';

// import {ArrowTable, ArrowTableBatch} from '@loaders.gl/arrow';

export {ParquetFormat} from './parquet-format';

export {
ParquetWorkerLoader,
ParquetLoader,
ParquetJSONWorkerLoader,
ParquetJSONLoader,
GeoParquetWorkerLoader,
GeoParquetLoader,
ParquetColumnarWorkerLoader,
ParquetColumnarLoader
} from './parquet-loader';
// deprecated
ParquetJSONWorkerLoader as ParquetWorkerLoader,
ParquetJSONLoader as ParquetLoader
} from './parquet-json-loader';

export {ParquetWriter as _ParquetWriter} from './parquet-writer';
export {
ParquetJSONWriter as _ParquetJSONWriter,
// deprecated
ParquetJSONWriter as ParquetWriter
} from './parquet-json-writer';

// EXPERIMENTAL - expose Parquet WASM loaders/writer

export type {ParquetWasmLoaderOptions} from './parquet-wasm-loader';
export {ParquetWasmLoader, ParquetWasmWorkerLoader} from './parquet-wasm-loader';
export {ParquetWasmWriter} from './parquet-wasm-writer';
export type {ParquetArrowLoaderOptions} from './parquet-arrow-loader';
export {ParquetArrowLoader, ParquetArrowWorkerLoader} from './parquet-arrow-loader';
export {ParquetArrowWriter} from './parquet-arrow-writer';

// EXPERIMENTAL - expose the internal parquetjs API

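A short sketch of how downstream imports map onto the reorganized exports above. The loader names come directly from the export list; the load() calls follow standard @loaders.gl/core usage:

import {load} from '@loaders.gl/core';
import {ParquetJSONLoader, ParquetArrowLoader} from '@loaders.gl/parquet';

// Row-based loading: previously exported as ParquetLoader, which now remains
// available only as a deprecated alias of ParquetJSONLoader.
const rowTable = await load('data.parquet', ParquetJSONLoader);

// Arrow loading via the WASM-backed loader, previously named ParquetWasmLoader.
const arrowTable = await load('data.parquet', ParquetArrowLoader);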
@@ -7,14 +7,14 @@ import type {ArrowTable} from '@loaders.gl/schema';
import * as arrow from 'apache-arrow';
import {loadWasm} from '../utils/load-wasm';

import type {ParquetWriterOptions} from '../../parquet-wasm-writer';
import type {ParquetArrowWriterOptions} from '../../parquet-arrow-writer';

/**
* Encode Arrow arrow.Table to Parquet buffer
*/
export async function encode(
export async function encodeArrowToParquet(
table: ArrowTable,
options: ParquetWriterOptions
options: ParquetArrowWriterOptions
): Promise<ArrayBuffer> {
const wasmUrl = options.parquet?.wasmUrl!;
const wasm = await loadWasm(wasmUrl);
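Assuming the signature shown above, internal callers would invoke the renamed encoder roughly as follows; the import path and wasmUrl value are placeholders, not taken from the PR:

import type {ArrowTable} from '@loaders.gl/schema';
// Hypothetical relative path; the actual file name is not shown in this excerpt.
import {encodeArrowToParquet} from './lib/encoders/encode-parquet-wasm';

async function writeArrowTable(table: ArrowTable): Promise<ArrayBuffer> {
  // The encoder dereferences options.parquet?.wasmUrl with `!`, so a wasmUrl must be supplied.
  return await encodeArrowToParquet(table, {parquet: {wasmUrl: './parquet_wasm_bg.wasm'}});
}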
2 changes: 1 addition & 1 deletion modules/parquet/src/lib/parsers/get-parquet-schema.ts
@@ -4,9 +4,9 @@

// loaders.gl
import {Schema} from '@loaders.gl/schema';
import {unpackGeoMetadata, unpackJSONStringMetadata} from '@loaders.gl/gis';
import {ParquetReader} from '../../parquetjs/parser/parquet-reader';
import {convertParquetSchema} from '../arrow/convert-schema-from-parquet';
import {unpackGeoMetadata, unpackJSONStringMetadata} from '@loaders.gl/gis';

export async function getSchemaFromParquetReader(reader: ParquetReader): Promise<Schema> {
const parquetSchema = await reader.getSchema();
36 changes: 36 additions & 0 deletions modules/parquet/src/lib/parsers/parse-geoparquet-to-geojson.ts
@@ -0,0 +1,36 @@
// loaders.gl
// SPDX-License-Identifier: MIT
// Copyright (c) vis.gl contributors

import type {ReadableFile} from '@loaders.gl/loader-utils';
import type {
GeoJSONTable,
GeoJSONTableBatch,
ObjectRowTable,
ObjectRowTableBatch
} from '@loaders.gl/schema';
// import {convertGeoArrowToTable} from '@loaders.gl/gis';

import type {ParquetJSONLoaderOptions} from '../../parquet-json-loader';

import {parseParquetFile, parseParquetFileInBatches} from './parse-parquet-to-json';

export async function parseGeoParquetFile(
file: ReadableFile,
options?: ParquetJSONLoaderOptions
): Promise<ObjectRowTable | GeoJSONTable> {
const table = await parseParquetFile(file, {...options, shape: 'object-row-table'});
// return convertGeoArrowToTable(table, 'geojson-table');
return table;
}

export async function* parseGeoParquetFileInBatches(
file: ReadableFile,
options?: ParquetJSONLoaderOptions
): AsyncIterable<ObjectRowTableBatch | GeoJSONTableBatch> {
const tableBatches = parseParquetFileInBatches(file, {...options, shape: 'object-row-table'});

for await (const batch of tableBatches) {
yield batch; // convertGeoArrowToTable(batch, 'geojson-table');
}
}
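Since the full contents of the new file are shown, a consumer sketch is straightforward. The BlobFile handle is an assumed ReadableFile implementation; which handle a caller actually uses is not shown here:

import {BlobFile} from '@loaders.gl/loader-utils'; // assumed ReadableFile implementation
import {parseGeoParquetFile} from './parse-geoparquet-to-geojson';

async function loadGeoParquet(blob: Blob) {
  // Currently returns an object-row table; the GeoJSON conversion is still commented out above.
  return await parseGeoParquetFile(new BlobFile(blob));
}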
81 changes: 0 additions & 81 deletions modules/parquet/src/lib/parsers/parse-geoparquet.ts

This file was deleted.

@@ -13,7 +13,7 @@ import * as arrow from 'apache-arrow';
import {loadWasm} from '../utils/load-wasm';
import {makeStreamIterator} from '../utils/make-stream-iterator';

export async function parseParquetFileWasm(
export async function parseParquetFileToArrow(
file: ReadableFile,
options?: parquetWasm.ReaderOptions & {wasmUrl: string}
): Promise<ArrowTable> {
@@ -40,7 +40,7 @@ export async function parseParquetFileWasm(
};
}

export async function* parseParquetFileInBatchesWasm(
export async function* parseParquetFileToArrowInBatches(
file: ReadableFile,
options: parquetWasm.ReaderOptions & {wasmUrl: string}
): AsyncIterable<ArrowTableBatch> {
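Given the renamed parser signatures above, consuming code might look roughly like this. A sketch only: the import path is hypothetical and only wasmUrl is known from the options type:

import {BlobFile} from '@loaders.gl/loader-utils'; // assumed ReadableFile implementation
// Hypothetical relative path; the file name is not shown in this excerpt.
import {parseParquetFileToArrow, parseParquetFileToArrowInBatches} from './parse-parquet-wasm';

async function readParquetAsArrow(blob: Blob) {
  const file = new BlobFile(blob);
  const table = await parseParquetFileToArrow(file, {wasmUrl: './parquet_wasm_bg.wasm'});
  let batches = 0;
  for await (const batch of parseParquetFileToArrowInBatches(file, {wasmUrl: './parquet_wasm_bg.wasm'})) {
    batches++; // each batch is an ArrowTableBatch
  }
  return {table, batches};
}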
8 changes: 4 additions & 4 deletions modules/parquet/src/lib/parsers/parse-parquet-to-columns.ts
@@ -4,7 +4,7 @@

import type {ColumnarTable, ColumnarTableBatch, Schema} from '@loaders.gl/schema';
import type {ReadableFile} from '@loaders.gl/loader-utils';
import type {ParquetLoaderOptions} from '../../parquet-loader';
import type {ParquetJSONLoaderOptions} from '../../parquet-json-loader';
import {ParquetReader} from '../../parquetjs/parser/parquet-reader';
import {ParquetRowGroup} from '../../parquetjs/schema/declare';
import {ParquetSchema} from '../../parquetjs/schema/schema';
@@ -14,11 +14,11 @@ import {installBufferPolyfill} from '../../polyfills/buffer/index';
import {preloadCompressions} from '../../parquetjs/compression';

/**
* @deprecated
* @deprecated - Use parseParquetToArrow
*/
export async function parseParquetFileInColumns(
file: ReadableFile,
options?: ParquetLoaderOptions
options?: ParquetJSONLoaderOptions
): Promise<ColumnarTable> {
installBufferPolyfill();
await preloadCompressions(options);
@@ -38,7 +38,7 @@ export async function parseParquetFileInColumns(
*/
export async function* parseParquetFileInColumnarBatches(
file: ReadableFile,
options?: ParquetLoaderOptions
options?: ParquetJSONLoaderOptions
): AsyncIterable<ColumnarTableBatch> {
installBufferPolyfill();
await preloadCompressions(options);
@@ -6,7 +6,7 @@ import {default as log} from '@probe.gl/log';
import type {ReadableFile} from '@loaders.gl/loader-utils';
import type {ObjectRowTable, ObjectRowTableBatch} from '@loaders.gl/schema';

import type {ParquetLoaderOptions} from '../../parquet-loader';
import type {ParquetJSONLoaderOptions} from '../../parquet-json-loader';
import type {ParquetRow} from '../../parquetjs/schema/declare';
import {ParquetReader} from '../../parquetjs/parser/parquet-reader';
import {getSchemaFromParquetReader} from './get-parquet-schema';
@@ -21,7 +21,7 @@ import {preloadCompressions} from '../../parquetjs/compression';
*/
export async function parseParquetFile(
file: ReadableFile,
options?: ParquetLoaderOptions
options?: ParquetJSONLoaderOptions
): Promise<ObjectRowTable> {
installBufferPolyfill();
await preloadCompressions(options);
@@ -56,8 +56,7 @@ export async function parseParquetFile(
data: rows
};

const shape = options?.parquet?.shape;
return convertTable(objectRowTable, shape);
return objectRowTable;
}

/**
@@ -67,7 +66,7 @@ export async function parseParquetFile(
*/
export async function* parseParquetFileInBatches(
file: ReadableFile,
options?: ParquetLoaderOptions
options?: ParquetJSONLoaderOptions
): AsyncIterable<ObjectRowTableBatch> {
installBufferPolyfill();
await preloadCompressions(options);
@@ -84,31 +83,12 @@ export async function* parseParquetFileInBatches(
schema,
data: rows
};
const shape = options?.parquet?.shape;
const table = convertTable(objectRowTable, shape);

yield {
batchType: 'data',
schema,
...table,
...objectRowTable,
length: rows.length
};
}
}

function convertTable(
objectRowTable: ObjectRowTable,
shape?: 'object-row-table' | 'geojson-table'
): ObjectRowTable {
switch (shape) {
case 'object-row-table':
return objectRowTable;

// Hack until geoparquet fixes up forwarded shape
case 'geojson-table':
return objectRowTable;

default:
throw new Error(shape);
}
}
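With the shape conversion removed, the batch parser now always yields object-row batches. A consumer sketch, with the file handle construction assumed as in the earlier examples:

import {BlobFile} from '@loaders.gl/loader-utils'; // assumed ReadableFile implementation
import {parseParquetFileInBatches} from './parse-parquet-to-json';

async function countParquetRows(blob: Blob): Promise<number> {
  let total = 0;
  for await (const batch of parseParquetFileInBatches(new BlobFile(blob))) {
    total += batch.length; // each batch reports its row count via `length`
  }
  return total;
}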