Initial commit of Iceberg integration. #5277

Merged Jun 5, 2024 (31 commits; the diff below shows changes from 25 commits)

Commits
54e0a80 - Initial commit of Iceberg integration. (lbooker42, Mar 22, 2024)
780f2b1 - Ready for auto-spotless anytime! (lbooker42, Mar 22, 2024)
585a26e - Refactored to use internal parquet objects instead of re-implementing… (lbooker42, Apr 2, 2024)
816f6b3 - Merge branch 'main' into lab-iceberg (lbooker42, Apr 2, 2024)
c2323d2 - Rebased to main. (lbooker42, Apr 2, 2024)
62952f4 - update to use faster URI creation (lbooker42, Apr 2, 2024)
e070b9f - Address PR comments. (lbooker42, Apr 2, 2024)
c7487fe - Fix gradle broken-ness. (lbooker42, Apr 2, 2024)
d4a80b8 - Gradle comments, a few changes to IcebergInstructions (lbooker42, Apr 2, 2024)
4f2ba28 - Final gradle fix (uncomment the correct lines) (lbooker42, Apr 3, 2024)
3bd98f9 - Addressed PR comments, more testing needed. (lbooker42, Apr 18, 2024)
de402ed - PR comments, improved testing. (lbooker42, Apr 19, 2024)
f45e7d8 - Merged with main. (lbooker42, Apr 22, 2024)
f2900aa - merged with main (lbooker42, May 1, 2024)
9762093 - WIP (lbooker42, May 3, 2024)
d7c2604 - WIP, but test code implemented. (lbooker42, May 14, 2024)
905c8ac - merged with main (lbooker42, May 15, 2024)
a507402 - Tests simplified and passing. (lbooker42, May 20, 2024)
8db7923 - Merge branch 'main' into lab-iceberg (lbooker42, May 20, 2024)
0390102 - Gradle cleanup. (lbooker42, May 22, 2024)
4539bee - Simplified Iceberg instructions. (lbooker42, May 28, 2024)
c5d6be1 - Addressed many PR comments. (lbooker42, May 30, 2024)
23e4a18 - Attempted to handle partitioning columns correctly. (lbooker42, May 31, 2024)
fa2e79d - Getting close to final. (lbooker42, May 31, 2024)
d6065e4 - Another rev from comments. (lbooker42, May 31, 2024)
ea5ca0e - WIP, some updates. (lbooker42, Jun 3, 2024)
35861c1 - Merge branch 'main' into lab-iceberg (lbooker42, Jun 3, 2024)
e51cf7c - Hadoop gradle version harmonization. (lbooker42, Jun 3, 2024)
b408f12 - Iceberg project restructure. (lbooker42, Jun 3, 2024)
de7d1f3 - Exposing 'iceberg-aws' in gradle. (lbooker42, Jun 3, 2024)
23d6e32 - Addressing PR comments. (lbooker42, Jun 4, 2024)
84 changes: 84 additions & 0 deletions extensions/iceberg/build.gradle
@@ -0,0 +1,84 @@
plugins {
    id 'java-library'
    id 'io.deephaven.project.register'
}

description 'Iceberg: Support to read iceberg catalogs.'

ext {
    hadoopVersion = '3.3.6'
}

dependencies {
    api project(':engine-api')
    api project(':engine-table')

    implementation project(':engine-base')
    implementation project(':log-factory')
    implementation project(':Configuration')

    implementation platform('software.amazon.awssdk:bom:2.23.19')
    implementation 'software.amazon.awssdk:s3'
    implementation 'software.amazon.awssdk:aws-crt-client'

    Classpaths.inheritAutoService(project)
    Classpaths.inheritImmutables(project)

    Classpaths.inheritJUnitPlatform(project)
    Classpaths.inheritAssertJ(project)

    Classpaths.inheritParquetHadoop(project)

    testImplementation 'org.junit.jupiter:junit-jupiter'
    testRuntimeOnly 'org.junit.jupiter:junit-jupiter-engine'
    testRuntimeOnly 'org.junit.platform:junit-platform-launcher'

    implementation project(':extensions-parquet-base')
    implementation project(':extensions-parquet-table')
    implementation project(':extensions-s3')

    implementation platform("org.apache.iceberg:iceberg-bom:1.5.0")
    implementation "org.apache.iceberg:iceberg-api"
    implementation "org.apache.iceberg:iceberg-core"
    implementation "org.apache.iceberg:iceberg-aws"
    // The following lines can be uncommented when we enable support for Azure and GCP
    // implementation "org.apache.iceberg:iceberg-azure"
    // implementation "org.apache.iceberg:iceberg-gcp"
    implementation "org.apache.iceberg:iceberg-bundled-guava"
    runtimeOnly "org.apache.iceberg:iceberg-aws-bundle"

    implementation "org.apache.hadoop:hadoop-common:${hadoopVersion}"
    implementation "org.apache.hadoop:hadoop-hdfs-client:${hadoopVersion}"

    // could be downstream configurable ?? testRuntimeOnly might be better
    // The following lines can be uncommented when we enable support for Azure and GCP
    // runtimeOnly "org.apache.iceberg:iceberg-azure-bundle"
    // runtimeOnly "org.apache.iceberg:iceberg-gcp-bundle"

    testImplementation "org.testcontainers:testcontainers:1.19.4"
    testImplementation "org.testcontainers:junit-jupiter:1.19.4"
    testImplementation "org.testcontainers:localstack:1.19.4"
    testImplementation "org.testcontainers:minio:1.19.4"

    testImplementation TestTools.projectDependency(project, 'extensions-s3')

    testRuntimeOnly project(':test-configs')
    testRuntimeOnly project(':log-to-slf4j')
    Classpaths.inheritSlf4j(project, 'slf4j-simple', 'testRuntimeOnly')
}

test {
    useJUnitPlatform {
        excludeTags("testcontainers")
    }
}

tasks.register('testOutOfBand', Test) {
    useJUnitPlatform {
        includeTags("testcontainers")
    }
    systemProperty 'testcontainers.localstack.image', project.property('testcontainers.localstack.image')
    systemProperty 'testcontainers.minio.image', project.property('testcontainers.minio.image')
}

4 changes: 4 additions & 0 deletions extensions/iceberg/gradle.properties
@@ -0,0 +1,4 @@
io.deephaven.project.ProjectType=JAVA_PUBLIC

testcontainers.localstack.image=localstack/localstack:3.1.0
testcontainers.minio.image=minio/minio:RELEASE.2024-02-04T22-36-13Z
Review comment (Member): Should we centralize this configuration?

@@ -0,0 +1,145 @@
//
// Copyright (c) 2016-2024 Deephaven Data Labs and Patent Pending
//
package io.deephaven.iceberg.layout;

import io.deephaven.base.FileUtils;
import io.deephaven.engine.table.TableDefinition;
import io.deephaven.engine.table.impl.locations.TableDataException;
import io.deephaven.engine.table.impl.locations.impl.TableLocationKeyFinder;
import io.deephaven.iceberg.location.IcebergTableLocationKey;
import io.deephaven.iceberg.location.IcebergTableParquetLocationKey;
import io.deephaven.iceberg.util.IcebergInstructions;
import io.deephaven.parquet.table.ParquetInstructions;
import org.apache.iceberg.*;
import org.apache.iceberg.io.FileIO;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;

import java.net.URI;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.function.Consumer;

public abstract class IcebergBaseLayout implements TableLocationKeyFinder<IcebergTableLocationKey> {
    /**
     * The {@link TableDefinition} that will be used for the table.
     */
    final TableDefinition tableDef;

    /**
     * The Iceberg {@link Table} to discover locations for.
     */
    final Table table;

    /**
     * The {@link Snapshot} to discover locations for.
     */
    final Snapshot snapshot;

    /**
     * The {@link FileIO} to use for reading manifest data files.
     */
    final FileIO fileIO;

    /**
     * The instructions for customizations while reading.
     */
    final IcebergInstructions instructions;

    /**
     * A cache of {@link IcebergTableLocationKey IcebergTableLocationKeys} keyed by the URI of the file they represent.
     */
    final Map<URI, IcebergTableLocationKey> cache;

    /**
     * The {@link ParquetInstructions} object that will be used to read any Parquet data files in this table. Only
     * accessed while synchronized on {@code this}.
     */
    ParquetInstructions parquetInstructions;

    protected IcebergTableLocationKey locationKey(
            final org.apache.iceberg.FileFormat format,
            final URI fileUri,
            @Nullable final Map<String, Comparable<?>> partitions) {

        if (format == org.apache.iceberg.FileFormat.PARQUET) {
            if (parquetInstructions == null) {
                // Start with user-supplied instructions (if provided).
                final ParquetInstructions.Builder builder = new ParquetInstructions.Builder();

                // Add the table definition.
                builder.setTableDefinition(tableDef);

                // Add any column rename mappings.
                if (!instructions.columnRenames().isEmpty()) {
                    for (Map.Entry<String, String> entry : instructions.columnRenames().entrySet()) {
                        builder.addColumnNameMapping(entry.getKey(), entry.getValue());
                    }
                }

                // Add the S3 instructions.
                instructions.s3Instructions().ifPresent(builder::setSpecialInstructions);

                parquetInstructions = builder.build();
            }
            return new IcebergTableParquetLocationKey(fileUri, 0, partitions, parquetInstructions);
        }
        throw new UnsupportedOperationException(String.format("%s:%d - unsupported file format %s for URI '%s'",
                table, snapshot.snapshotId(), format, fileUri));
    }

    /**
     * @param tableDef The {@link TableDefinition} that will be used for the table.
     * @param table The {@link Table} to discover locations for.
     * @param tableSnapshot The {@link Snapshot} from which to discover data files.
     * @param fileIO The file IO to use for reading manifest data files.
     * @param instructions The instructions for customizations while reading.
     */
    public IcebergBaseLayout(
            @NotNull final TableDefinition tableDef,
            @NotNull final Table table,
            @NotNull final Snapshot tableSnapshot,
            @NotNull final FileIO fileIO,
            @NotNull final IcebergInstructions instructions) {
        this.tableDef = tableDef;
        this.table = table;
        this.snapshot = tableSnapshot;
        this.fileIO = fileIO;
        this.instructions = instructions;

        this.cache = new HashMap<>();
    }

    abstract IcebergTableLocationKey keyFromDataFile(DataFile df, URI fileUri);

    @Override
    public synchronized void findKeys(@NotNull final Consumer<IcebergTableLocationKey> locationKeyObserver) {
        try {
            // Retrieve the manifest files from the snapshot.
            final List<ManifestFile> manifestFiles = snapshot.allManifests(fileIO);
            for (final ManifestFile manifestFile : manifestFiles) {
                // Currently, only manifest files with DATA content type can be processed.
                if (manifestFile.content() != ManifestContent.DATA) {
                    throw new TableDataException(
                            String.format("%s:%d - only DATA manifest files are currently supported, encountered %s",
                                    table, snapshot.snapshotId(), manifestFile.content()));
                }
                try (final ManifestReader<DataFile> reader = ManifestFiles.read(manifestFile, fileIO)) {
                    for (DataFile df : reader) {
                        final URI fileUri = FileUtils.convertToURI(df.path().toString(), false);
                        final IcebergTableLocationKey locationKey =
                                cache.computeIfAbsent(fileUri, uri -> keyFromDataFile(df, fileUri));
                        if (locationKey != null) {
                            locationKeyObserver.accept(locationKey);
                        }
                    }
                }
            }
        } catch (final Exception e) {
            throw new TableDataException(
                    String.format("%s:%d - error finding Iceberg locations", table, snapshot.snapshotId()), e);
        }
    }
}
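The discover-and-cache contract of `findKeys` can be illustrated with a stdlib-only sketch: each data-file URI maps to exactly one location key via `computeIfAbsent`, and every discovered key is pushed to a caller-supplied `Consumer`. The class and record names below are hypothetical stand-ins for `IcebergBaseLayout` and `IcebergTableLocationKey`, not part of the PR.

```java
import java.net.URI;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.function.Consumer;

public final class KeyFinderSketch {
    // Hypothetical stand-in for IcebergTableLocationKey.
    record LocationKey(URI uri) {}

    // Keys are cached by file URI, mirroring the `cache` field above.
    private final Map<URI, LocationKey> cache = new HashMap<>();

    // Mirrors findKeys(): one key per URI, every key handed to the observer.
    void findKeys(final List<URI> dataFileUris, final Consumer<LocationKey> observer) {
        for (final URI fileUri : dataFileUris) {
            // computeIfAbsent guarantees the same key instance across repeated scans.
            final LocationKey key = cache.computeIfAbsent(fileUri, LocationKey::new);
            observer.accept(key);
        }
    }

    public static void main(String[] args) {
        final KeyFinderSketch finder = new KeyFinderSketch();
        final URI uri = URI.create("s3://bucket/table/a.parquet");
        final List<LocationKey> seen = new ArrayList<>();
        finder.findKeys(List.of(uri, uri), seen::add);
        // The duplicate URI yields the identical cached key instance.
        System.out.println(seen.get(0) == seen.get(1));
    }
}
```

In the real implementation the cached value may be null (when `keyFromDataFile` rejects a file), which is why `findKeys` null-checks before invoking the observer.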
@@ -0,0 +1,46 @@
//
// Copyright (c) 2016-2024 Deephaven Data Labs and Patent Pending
//
package io.deephaven.iceberg.layout;

import io.deephaven.engine.table.TableDefinition;
import io.deephaven.engine.table.impl.locations.impl.TableLocationKeyFinder;
import io.deephaven.iceberg.location.IcebergTableLocationKey;
import io.deephaven.iceberg.util.IcebergInstructions;
import org.apache.iceberg.*;
import org.apache.iceberg.io.FileIO;
import org.jetbrains.annotations.NotNull;

import java.net.URI;

/**
* Iceberg {@link TableLocationKeyFinder location finder} for tables without partitions that will discover data files
* from a {@link Snapshot}
*/
public final class IcebergFlatLayout extends IcebergBaseLayout {
/**
* @param tableDef The {@link TableDefinition} that will be used for the table.
* @param table The {@link Table} to discover locations for.
* @param tableSnapshot The {@link Snapshot} from which to discover data files.
* @param fileIO The file IO to use for reading manifest data files.
* @param instructions The instructions for customizations while reading.
*/
public IcebergFlatLayout(
@NotNull final TableDefinition tableDef,
@NotNull final Table table,
@NotNull final Snapshot tableSnapshot,
@NotNull final FileIO fileIO,
@NotNull final IcebergInstructions instructions) {
super(tableDef, table, tableSnapshot, fileIO, instructions);
}

@Override
public String toString() {
return IcebergFlatLayout.class.getSimpleName() + '[' + table.name() + ']';
}

@Override
IcebergTableLocationKey keyFromDataFile(DataFile df, URI fileUri) {
return locationKey(df.format(), fileUri, null);
}
}
@@ -0,0 +1,104 @@
//
// Copyright (c) 2016-2024 Deephaven Data Labs and Patent Pending
//
package io.deephaven.iceberg.layout;

import io.deephaven.engine.table.ColumnDefinition;
import io.deephaven.engine.table.TableDefinition;
import io.deephaven.engine.table.impl.locations.TableDataException;
import io.deephaven.engine.table.impl.locations.impl.TableLocationKeyFinder;
import io.deephaven.iceberg.location.IcebergTableLocationKey;
import io.deephaven.iceberg.util.IcebergInstructions;
import io.deephaven.util.type.TypeUtils;
import org.apache.commons.lang3.mutable.MutableInt;
import org.apache.iceberg.*;
import org.apache.iceberg.io.FileIO;
import org.jetbrains.annotations.NotNull;

import java.net.URI;
import java.util.*;
import java.util.stream.Collectors;

/**
* Iceberg {@link TableLocationKeyFinder location finder} for tables with partitions that will discover data files from
* a {@link Snapshot}
*/
public final class IcebergKeyValuePartitionedLayout extends IcebergBaseLayout {
private class ColumnData {
final String name;
final Class<?> type;
final int index;

public ColumnData(String name, Class<?> type, int index) {
this.name = name;
this.type = type;
this.index = index;
}
}

private final List<ColumnData> outputPartitioningColumns;

/**
* @param tableDef The {@link TableDefinition} that will be used for the table.
* @param table The {@link Table} to discover locations for.
* @param tableSnapshot The {@link Snapshot} from which to discover data files.
* @param fileIO The file IO to use for reading manifest data files.
* @param partitionSpec The Iceberg {@link PartitionSpec partition spec} for the table.
* @param instructions The instructions for customizations while reading.
*/
public IcebergKeyValuePartitionedLayout(
@NotNull final TableDefinition tableDef,
@NotNull final org.apache.iceberg.Table table,
@NotNull final org.apache.iceberg.Snapshot tableSnapshot,
@NotNull final FileIO fileIO,
@NotNull final PartitionSpec partitionSpec,
lbooker42 marked this conversation as resolved.
Show resolved Hide resolved
@NotNull final IcebergInstructions instructions) {
super(tableDef, table, tableSnapshot, fileIO, instructions);

// We can assume due to upstream validation that there are no duplicate names (after renaming) that are included
// in the output definition, so we can ignore duplicates.
final MutableInt icebergIndex = new MutableInt(0);
final Map<String, Integer> availablePartitioningColumns = partitionSpec.fields().stream()
.map(PartitionField::name)
.map(name -> instructions.columnRenames().getOrDefault(name, name))
.collect(Collectors.toMap(
name -> name,
name -> icebergIndex.getAndIncrement(),
(v1, v2) -> v1,
LinkedHashMap::new));

outputPartitioningColumns = tableDef.getColumnStream()
.map((final ColumnDefinition<?> columnDef) -> {
final Integer index = availablePartitioningColumns.get(columnDef.getName());
if (index == null) {
return null;
}
return new ColumnData(columnDef.getName(), TypeUtils.getBoxedType(columnDef.getDataType()), index);
})
.filter(Objects::nonNull)
.collect(Collectors.toList());
}

@Override
public String toString() {
return IcebergKeyValuePartitionedLayout.class.getSimpleName() + '[' + table.name() + ']';
}

@Override
IcebergTableLocationKey keyFromDataFile(DataFile df, URI fileUri) {
final Map<String, Comparable<?>> partitions = new LinkedHashMap<>();

final PartitionData partitionData = (PartitionData) df.partition();
for (final ColumnData colData : outputPartitioningColumns) {
final String colName = colData.name;
final Object colValue = partitionData.get(colData.index);
if (colValue != null && !colData.type.isAssignableFrom(colValue.getClass())) {
throw new TableDataException("Partitioning column " + colName
+ " has type " + colValue.getClass().getName()
+ " but expected " + colData.type.getName());
}
partitions.put(colName, (Comparable<?>) colValue);
}
return locationKey(df.format(), fileUri, partitions);
}
}
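The name-to-index mapping built in the constructor above (partition-field names, after any user-supplied renames, mapped to their ordinal position in the partition data) can be distilled into a stdlib-only sketch. It substitutes `AtomicInteger` for commons-lang3's `MutableInt`; the field names and renames in `main` are made up for illustration.

```java
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.stream.Collectors;

public final class PartitionIndexSketch {
    // Maps each (possibly renamed) partition-field name to its index in the
    // Iceberg partition data, preserving encounter order.
    public static Map<String, Integer> indexByOutputName(
            final List<String> partitionFieldNames,
            final Map<String, String> columnRenames) {
        final AtomicInteger icebergIndex = new AtomicInteger(0);
        return partitionFieldNames.stream()
                .map(name -> columnRenames.getOrDefault(name, name))
                .collect(Collectors.toMap(
                        name -> name,
                        name -> icebergIndex.getAndIncrement(),
                        (v1, v2) -> v1, // ignore duplicates, keep the first index
                        LinkedHashMap::new));
    }

    public static void main(String[] args) {
        // Hypothetical spec with fields "year" and "month", where the user
        // renamed "month" to "Month".
        final Map<String, Integer> m = indexByOutputName(
                List.of("year", "month"), Map.of("month", "Month"));
        System.out.println(m); // {year=0, Month=1}
    }
}
```

`keyFromDataFile` then uses these indices to pull each column's value out of the `PartitionData` positionally, which is why the order-preserving `LinkedHashMap` collection matters.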