From 6ea15871f0d319a7ae123445a322d42c1c78319a Mon Sep 17 00:00:00 2001 From: Shivam Malhotra Date: Wed, 27 Dec 2023 22:58:49 +0530 Subject: [PATCH] Sample commit for how URIs in place of files and strings look like --- .../locations/local/FileTableLocationKey.java | 22 +- .../engine/table/impl/TestCodecColumns.java | 7 +- .../impl/TestKeyValuePartitionLayout.java | 40 +- .../parquet/base/ParquetFileReader.java | 17 +- .../parquet/table/ParquetSchemaReader.java | 12 +- .../deephaven/parquet/table/ParquetTools.java | 174 +- .../DeephavenNestedPartitionLayout.java | 2 +- .../layout/ParquetFlatPartitionedLayout.java | 2 +- .../ParquetKeyValuePartitionedLayout.java | 4 +- .../layout/ParquetMetadataFileLayout.java | 7 +- .../table/layout/ParquetSingleFileLayout.java | 14 +- .../table/location/ParquetColumnLocation.java | 35 +- .../table/location/ParquetTableLocation.java | 15 +- .../location/ParquetTableLocationFactory.java | 9 +- .../location/ParquetTableLocationKey.java | 23 +- .../table/ParquetTableReadWriteTest.java | 4623 +++++++++-------- 16 files changed, 2543 insertions(+), 2463 deletions(-) diff --git a/engine/table/src/main/java/io/deephaven/engine/table/impl/locations/local/FileTableLocationKey.java b/engine/table/src/main/java/io/deephaven/engine/table/impl/locations/local/FileTableLocationKey.java index da998585c20..69d8e47a057 100644 --- a/engine/table/src/main/java/io/deephaven/engine/table/impl/locations/local/FileTableLocationKey.java +++ b/engine/table/src/main/java/io/deephaven/engine/table/impl/locations/local/FileTableLocationKey.java @@ -12,6 +12,7 @@ import org.jetbrains.annotations.Nullable; import java.io.File; +import java.net.URI; import java.util.Map; /** @@ -24,7 +25,7 @@ public class FileTableLocationKey extends PartitionedTableLocationKey { private static final String IMPLEMENTATION_NAME = FileTableLocationKey.class.getSimpleName(); - protected final File file; + protected final URI parquetFileURI; private final int order; private int cachedHashCode; @@ -32,7 +33,8 @@ public class FileTableLocationKey extends PartitionedTableLocationKey { /** * Construct a new FileTableLocationKey for the supplied {@code file} and {@code partitions}. * - * @param file The file (or directory) that backs the keyed location. Will be adjusted to an absolute path. + * @param parquetFileURI The file (or directory) that backs the keyed location. Will be adjusted to an absolute + * path. * @param order Explicit ordering value for this location key. {@link Comparable#compareTo(Object)} will sort * FileTableLocationKeys with a lower {@code order} before other keys. Comparing this ordering value takes * precedence over other fields. @@ -41,21 +43,21 @@ public class FileTableLocationKey extends PartitionedTableLocationKey { * be made, so the calling code is free to mutate the map after this call completes, but the partition keys * and values themselves must be effectively immutable. 
*/ - public FileTableLocationKey(@NotNull final File file, final int order, + public FileTableLocationKey(@NotNull final URI parquetFileURI, final int order, @Nullable final Map> partitions) { super(partitions); - this.file = file.getAbsoluteFile(); + this.parquetFileURI = parquetFileURI; this.order = order; } - public final File getFile() { - return file; + public final URI getURI() { + return parquetFileURI; } @Override public LogOutput append(@NotNull final LogOutput logOutput) { return logOutput.append(getImplementationName()) - .append(":[file=").append(file.getPath()) + .append(":[file=").append(parquetFileURI.toString()) .append(",partitions=").append(PartitionsFormatter.INSTANCE, partitions) .append(']'); } @@ -84,7 +86,7 @@ public int compareTo(@NotNull final TableLocationKey other) { if (partitionComparisonResult != 0) { return partitionComparisonResult; } - return file.compareTo(otherTyped.file); + return parquetFileURI.compareTo(otherTyped.parquetFileURI); } throw new ClassCastException("Cannot compare " + getClass() + " to " + other.getClass()); } @@ -92,7 +94,7 @@ public int compareTo(@NotNull final TableLocationKey other) { @Override public int hashCode() { if (cachedHashCode == 0) { - final int computedHashCode = 31 * partitions.hashCode() + file.hashCode(); + final int computedHashCode = 31 * partitions.hashCode() + parquetFileURI.hashCode(); // Don't use 0; that's used by StandaloneTableLocationKey, and also our sentinel for the need to compute if (computedHashCode == 0) { final int fallbackHashCode = FileTableLocationKey.class.hashCode(); @@ -113,7 +115,7 @@ public boolean equals(@Nullable final Object other) { return false; } final FileTableLocationKey otherTyped = (FileTableLocationKey) other; - return file.equals(otherTyped.file) && partitions.equals(otherTyped.partitions); + return parquetFileURI.equals(otherTyped.parquetFileURI) && partitions.equals(otherTyped.partitions); } @Override diff --git a/engine/table/src/test/java/io/deephaven/engine/table/impl/TestCodecColumns.java b/engine/table/src/test/java/io/deephaven/engine/table/impl/TestCodecColumns.java index 12e776eab0a..37878244364 100644 --- a/engine/table/src/test/java/io/deephaven/engine/table/impl/TestCodecColumns.java +++ b/engine/table/src/test/java/io/deephaven/engine/table/impl/TestCodecColumns.java @@ -23,6 +23,8 @@ import java.io.File; import java.io.IOException; import java.math.BigInteger; +import java.net.URI; +import java.net.URISyntaxException; import java.nio.file.Files; import java.nio.file.Paths; @@ -94,14 +96,15 @@ public void setUp() { } @Test - public void doColumnsTest() throws IOException { + public void doColumnsTest() throws IOException, URISyntaxException { final File dir = Files.createTempDirectory(Paths.get(""), "CODEC_TEST").toFile(); final File dest = new File(dir, "Test.parquet"); + final URI destURI = new URI(dest.toString()); try { ParquetTools.writeTable(table, dest, table.getDefinition(), writeInstructions); final MutableObject instructionsOut = new MutableObject<>(); final Table result = - ParquetTools.readParquetSchemaAndTable(dest, ParquetInstructions.EMPTY, instructionsOut); + ParquetTools.readParquetSchemaAndTable(destURI, ParquetInstructions.EMPTY, instructionsOut); TableTools.show(result); TestCase.assertEquals(TABLE_DEFINITION, result.getDefinition()); final ParquetInstructions readInstructions = instructionsOut.getValue(); diff --git a/engine/table/src/test/java/io/deephaven/engine/table/impl/locations/impl/TestKeyValuePartitionLayout.java 
b/engine/table/src/test/java/io/deephaven/engine/table/impl/locations/impl/TestKeyValuePartitionLayout.java index 6bcde5137ac..446a1e57b45 100644 --- a/engine/table/src/test/java/io/deephaven/engine/table/impl/locations/impl/TestKeyValuePartitionLayout.java +++ b/engine/table/src/test/java/io/deephaven/engine/table/impl/locations/impl/TestKeyValuePartitionLayout.java @@ -58,14 +58,14 @@ public void testFlat() throws IOException { final RecordingLocationKeyFinder recorder = new RecordingLocationKeyFinder<>(); new KeyValuePartitionLayout<>(dataDirectory, path -> true, () -> new LocationTableBuilderCsv(dataDirectory), - (path, partitions) -> new FileTableLocationKey(path.toFile(), 0, partitions), 0).findKeys(recorder); + (path, partitions) -> new FileTableLocationKey(path.toUri(), 0, partitions), 0).findKeys(recorder); final List results = recorder.getRecordedKeys().stream().sorted().collect(Collectors.toList()); TestCase.assertEquals(2, results.size()); - TestCase.assertEquals(file1.getAbsoluteFile(), results.get(0).getFile()); - TestCase.assertEquals(file2.getAbsoluteFile(), results.get(1).getFile()); + TestCase.assertEquals(file1.getAbsoluteFile(), results.get(0).getURI()); + TestCase.assertEquals(file2.getAbsoluteFile(), results.get(1).getURI()); TestCase.assertTrue(results.get(0).getPartitionKeys().isEmpty()); TestCase.assertTrue(results.get(1).getPartitionKeys().isEmpty()); @@ -82,14 +82,14 @@ public void testOneLevel() throws IOException { final RecordingLocationKeyFinder recorder = new RecordingLocationKeyFinder<>(); new KeyValuePartitionLayout<>(dataDirectory, path -> true, () -> new LocationTableBuilderCsv(dataDirectory), - (path, partitions) -> new FileTableLocationKey(path.toFile(), 0, partitions), 1).findKeys(recorder); + (path, partitions) -> new FileTableLocationKey(path.toUri(), 0, partitions), 1).findKeys(recorder); final List results = recorder.getRecordedKeys().stream().sorted().collect(Collectors.toList()); TestCase.assertEquals(2, results.size()); - TestCase.assertEquals(file2.getAbsoluteFile(), results.get(0).getFile()); - TestCase.assertEquals(file1.getAbsoluteFile(), results.get(1).getFile()); + TestCase.assertEquals(file2.getAbsoluteFile(), results.get(0).getURI()); + TestCase.assertEquals(file1.getAbsoluteFile(), results.get(1).getURI()); TestCase.assertEquals(1, results.get(0).getPartitionKeys().size()); TestCase.assertEquals(1, results.get(1).getPartitionKeys().size()); @@ -115,15 +115,15 @@ public void testThreeLevels() throws IOException { final RecordingLocationKeyFinder recorder = new RecordingLocationKeyFinder<>(); new KeyValuePartitionLayout<>(dataDirectory, path -> true, () -> new LocationTableBuilderCsv(dataDirectory), - (path, partitions) -> new FileTableLocationKey(path.toFile(), 0, partitions), 3).findKeys(recorder); + (path, partitions) -> new FileTableLocationKey(path.toUri(), 0, partitions), 3).findKeys(recorder); final List results = recorder.getRecordedKeys().stream().sorted().collect(Collectors.toList()); TestCase.assertEquals(3, results.size()); - TestCase.assertEquals(file2.getAbsoluteFile(), results.get(0).getFile()); - TestCase.assertEquals(file3.getAbsoluteFile(), results.get(1).getFile()); - TestCase.assertEquals(file1.getAbsoluteFile(), results.get(2).getFile()); + TestCase.assertEquals(file2.getAbsoluteFile(), results.get(0).getURI()); + TestCase.assertEquals(file3.getAbsoluteFile(), results.get(1).getURI()); + TestCase.assertEquals(file1.getAbsoluteFile(), results.get(2).getURI()); TestCase.assertEquals(3, 
results.get(0).getPartitionKeys().size()); TestCase.assertEquals(3, results.get(1).getPartitionKeys().size()); @@ -166,7 +166,7 @@ public void testTypesAndNameLegalization() throws IOException { for (final Supplier locationTableBuilderSupplier : locationTableBuilderSuppliers) { final TableLocationKeyFinder finder = new KeyValuePartitionLayout<>( dataDirectory, path -> true, locationTableBuilderSupplier, - (path, partitions) -> new FileTableLocationKey(path.toFile(), 0, partitions), 3); + (path, partitions) -> new FileTableLocationKey(path.toUri(), 0, partitions), 3); final RecordingLocationKeyFinder recorder1 = new RecordingLocationKeyFinder<>(); finder.findKeys(recorder1); @@ -180,9 +180,9 @@ public void testTypesAndNameLegalization() throws IOException { TestCase.assertEquals(3, results1.size()); - TestCase.assertEquals(file2.getAbsoluteFile(), results1.get(0).getFile()); - TestCase.assertEquals(file3.getAbsoluteFile(), results1.get(1).getFile()); - TestCase.assertEquals(file1.getAbsoluteFile(), results1.get(2).getFile()); + TestCase.assertEquals(file2.getAbsoluteFile(), results1.get(0).getURI()); + TestCase.assertEquals(file3.getAbsoluteFile(), results1.get(1).getURI()); + TestCase.assertEquals(file1.getAbsoluteFile(), results1.get(2).getURI()); TestCase.assertEquals(3, results1.get(0).getPartitionKeys().size()); TestCase.assertEquals(3, results1.get(1).getPartitionKeys().size()); @@ -219,7 +219,7 @@ public void testMaxDepthEmpty() throws IOException { final RecordingLocationKeyFinder recorder = new RecordingLocationKeyFinder<>(); new KeyValuePartitionLayout<>(dataDirectory, path -> true, () -> new LocationTableBuilderCsv(dataDirectory), - (path, partitions) -> new FileTableLocationKey(path.toFile(), 0, partitions), 3).findKeys(recorder); + (path, partitions) -> new FileTableLocationKey(path.toUri(), 0, partitions), 3).findKeys(recorder); final List results = recorder.getRecordedKeys().stream().sorted().collect(Collectors.toList()); @@ -247,15 +247,15 @@ public void testMaxDepth() throws IOException { final RecordingLocationKeyFinder recorder = new RecordingLocationKeyFinder<>(); new KeyValuePartitionLayout<>(dataDirectory, path -> true, () -> new LocationTableBuilderCsv(dataDirectory), - (path, partitions) -> new FileTableLocationKey(path.toFile(), 0, partitions), 3).findKeys(recorder); + (path, partitions) -> new FileTableLocationKey(path.toUri(), 0, partitions), 3).findKeys(recorder); final List results = recorder.getRecordedKeys().stream().sorted().collect(Collectors.toList()); TestCase.assertEquals(3, results.size()); - TestCase.assertEquals(file2.getAbsoluteFile(), results.get(0).getFile()); - TestCase.assertEquals(file3.getAbsoluteFile(), results.get(1).getFile()); - TestCase.assertEquals(file1.getAbsoluteFile(), results.get(2).getFile()); + TestCase.assertEquals(file2.getAbsoluteFile(), results.get(0).getURI()); + TestCase.assertEquals(file3.getAbsoluteFile(), results.get(1).getURI()); + TestCase.assertEquals(file1.getAbsoluteFile(), results.get(2).getURI()); } @Test @@ -275,7 +275,7 @@ public void testMismatch() throws IOException { try { new KeyValuePartitionLayout<>(dataDirectory, path -> true, () -> new LocationTableBuilderCsv(dataDirectory), - (path, partitions) -> new FileTableLocationKey(path.toFile(), 0, partitions), 3).findKeys(ftlk -> { + (path, partitions) -> new FileTableLocationKey(path.toUri(), 0, partitions), 3).findKeys(ftlk -> { }); TestCase.fail("Expected exception"); } catch (TableDataException expected) { diff --git 
a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileReader.java b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileReader.java index fe6ef46df6d..708a4181616 100644 --- a/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileReader.java +++ b/extensions/parquet/base/src/main/java/io/deephaven/parquet/base/ParquetFileReader.java @@ -12,6 +12,7 @@ import java.io.ByteArrayInputStream; import java.io.IOException; +import java.net.URI; import java.nio.ByteBuffer; import java.nio.ByteOrder; import java.nio.channels.SeekableByteChannel; @@ -27,20 +28,24 @@ public class ParquetFileReader { private static final int FOOTER_LENGTH_SIZE = 4; private static final String MAGIC_STR = "PAR1"; static final byte[] MAGIC = MAGIC_STR.getBytes(StandardCharsets.US_ASCII); + public static final String S3_PARQUET_FILE_URI_SCHEME = "s3"; public final FileMetaData fileMetaData; private final SeekableChannelsProvider channelsProvider; private final Path rootPath; private final MessageType type; - public ParquetFileReader(final String filePath, final SeekableChannelsProvider channelsProvider) + public ParquetFileReader(final URI parquetFileURI, final SeekableChannelsProvider channelsProvider) throws IOException { this.channelsProvider = channelsProvider; - // Root path should be this file if a single file, else the parent directory for a metadata - // file - rootPath = - filePath.endsWith(".parquet") ? Paths.get(filePath) : Paths.get(filePath).getParent(); - + final String filePath = parquetFileURI.toString(); + // Root path should be this file if a single file, else the parent directory for a metadata file + if ((parquetFileURI.getScheme() != null && parquetFileURI.getScheme().equals(S3_PARQUET_FILE_URI_SCHEME)) + || parquetFileURI.getRawPath().endsWith(".parquet")) { + rootPath = Path.of(filePath); + } else { + rootPath = Paths.get(parquetFileURI).getParent(); + } final byte[] footer; try (final SeekableChannelsProvider.ChannelContext context = channelsProvider.makeContext(); final SeekableByteChannel readChannel = channelsProvider.getReadChannel(context, filePath)) { diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetSchemaReader.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetSchemaReader.java index d6bb66d7c10..760e26d52e6 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetSchemaReader.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetSchemaReader.java @@ -21,9 +21,10 @@ import org.apache.parquet.schema.PrimitiveType; import org.jetbrains.annotations.NotNull; -import java.io.File; import java.io.IOException; import java.math.BigInteger; +import java.net.URI; +import java.net.URISyntaxException; import java.time.Instant; import java.time.LocalDate; import java.time.LocalDateTime; @@ -98,8 +99,13 @@ public static ParquetInstructions readParquetSchema( @NotNull final ParquetInstructions readInstructions, @NotNull final ColumnDefinitionConsumer consumer, @NotNull final BiFunction, String> legalizeColumnNameFunc) throws IOException { - final ParquetFileReader parquetFileReader = - ParquetTools.getParquetFileReaderChecked(new File(filePath), readInstructions); + final ParquetFileReader parquetFileReader; + try { + parquetFileReader = + ParquetTools.getParquetFileReaderChecked(new URI(filePath), readInstructions); + } catch (final URISyntaxException e) { + throw new UncheckedDeephavenException("Failed to parse URI 
" + filePath, e); + } final ParquetMetadata parquetMetadata = new ParquetMetadataConverter().fromParquetMetadata(parquetFileReader.fileMetaData); return readParquetSchema(parquetFileReader.getSchema(), parquetMetadata.getFileMetaData().getKeyValueMetaData(), diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java index 80bec9ffa6e..53a2196b78a 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/ParquetTools.java @@ -7,7 +7,6 @@ import io.deephaven.base.ClassUtil; import io.deephaven.base.FileUtils; import io.deephaven.base.Pair; -import io.deephaven.base.verify.Assert; import io.deephaven.base.verify.Require; import io.deephaven.engine.context.ExecutionContext; import io.deephaven.engine.table.ColumnDefinition; @@ -48,12 +47,15 @@ import java.io.File; import java.io.IOException; +import java.net.URI; +import java.net.URISyntaxException; import java.nio.file.DirectoryStream; import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.attribute.BasicFileAttributes; import java.util.*; +import static io.deephaven.parquet.base.ParquetFileReader.S3_PARQUET_FILE_URI_SCHEME; import static io.deephaven.parquet.table.ParquetTableWriter.PARQUET_FILE_EXTENSION; import static io.deephaven.util.type.TypeUtils.getUnboxedTypeIfBoxed; @@ -79,7 +81,7 @@ private ParquetTools() {} * key} order) location found will be used to infer schema. * *

- * Delegates to one of {@link #readSingleFileTable(File, ParquetInstructions)}, + * Delegates to one of {@link #readSingleFileTable(URI, ParquetInstructions)}, * {@link #readPartitionedTableWithMetadata(File, ParquetInstructions)}, * {@link #readFlatPartitionedTable(File, ParquetInstructions)}, or * {@link #readKeyValuePartitionedTable(File, ParquetInstructions)}. @@ -92,27 +94,26 @@ private ParquetTools() {} * @see ParquetFlatPartitionedLayout */ public static Table readTable(@NotNull final String sourceFilePath) { - if (sourceFilePath.startsWith("s3:") && sourceFilePath.endsWith(PARQUET_FILE_EXTENSION)) { - // TODO This is hacky, because here URI is getting converted to a file path and // will change to / - // We need to keep this as a URI and internally check if its a file or S3 backed URI - return readSingleFileTable(new File(sourceFilePath), ParquetInstructions.EMPTY); + final URI sourceFileURI; + try { + sourceFileURI = new URI(sourceFilePath); + } catch (final URISyntaxException e) { + throw new UncheckedDeephavenException("Failed to parse URI " + sourceFilePath, e); } - return readTableInternal(new File(sourceFilePath), ParquetInstructions.EMPTY); + return readTableInternal(sourceFileURI, ParquetInstructions.EMPTY); } public static Table readTable(@NotNull final String sourceFilePath, @NotNull final ParquetInstructions readInstructions, @NotNull final TableDefinition tableDefinition) { - if (sourceFilePath.startsWith("s3:") && sourceFilePath.endsWith(PARQUET_FILE_EXTENSION)) { - final Object specialInstructions = readInstructions.getSpecialInstructions(); - Assert.instanceOf(specialInstructions, "specialInstructions", S3ParquetInstructions.class); - final S3ParquetInstructions s3Instructions = (S3ParquetInstructions) specialInstructions; - - // TODO This is hacky, because here URI is getting converted to a file path and // will change to / - // We need to keep this as a URI and internally check if its a file or S3 backed URI - return readSingleFileTable(new File(sourceFilePath), readInstructions, tableDefinition); + // TODO Remove this overload + final URI sourceFileURI; + try { + sourceFileURI = new URI(sourceFilePath); + } catch (final URISyntaxException e) { + throw new UncheckedDeephavenException("Failed to parse URI " + sourceFilePath, e); } - return readTableInternal(new File(sourceFilePath), readInstructions); + return readSingleFileTable(sourceFileURI, readInstructions, tableDefinition); } /** @@ -125,7 +126,7 @@ public static Table readTable(@NotNull final String sourceFilePath, * key} order) location found will be used to infer schema. * *

- * Delegates to one of {@link #readSingleFileTable(File, ParquetInstructions)}, + * Delegates to one of {@link #readSingleFileTable(URI, ParquetInstructions)}, * {@link #readPartitionedTableWithMetadata(File, ParquetInstructions)}, * {@link #readFlatPartitionedTable(File, ParquetInstructions)}, or * {@link #readKeyValuePartitionedTable(File, ParquetInstructions)}. @@ -141,12 +142,13 @@ public static Table readTable(@NotNull final String sourceFilePath, public static Table readTable( @NotNull final String sourceFilePath, @NotNull final ParquetInstructions readInstructions) { - if (sourceFilePath.startsWith("s3:") && sourceFilePath.endsWith(PARQUET_FILE_EXTENSION)) { - // TODO This is hacky, because here URI is getting converted to a file path and // will change to / - // We need to keep this as a URI and internally check if its a file or S3 backed URI - return readSingleFileTable(new File(sourceFilePath), readInstructions); + final URI sourceFileURI; + try { + sourceFileURI = new URI(sourceFilePath); + } catch (final URISyntaxException e) { + throw new UncheckedDeephavenException("Failed to parse URI " + sourceFilePath, e); } - return readTableInternal(new File(sourceFilePath), readInstructions); + return readTableInternal(sourceFileURI, readInstructions); } /** @@ -159,7 +161,7 @@ public static Table readTable( * key} order) location found will be used to infer schema. * *

- * Delegates to one of {@link #readSingleFileTable(File, ParquetInstructions)}, + * Delegates to one of {@link #readSingleFileTable(URI, ParquetInstructions)}, * {@link #readPartitionedTableWithMetadata(File, ParquetInstructions)}, * {@link #readFlatPartitionedTable(File, ParquetInstructions)}, or * {@link #readKeyValuePartitionedTable(File, ParquetInstructions)}. @@ -172,7 +174,13 @@ public static Table readTable( * @see ParquetFlatPartitionedLayout */ public static Table readTable(@NotNull final File sourceFile) { - return readTableInternal(sourceFile, ParquetInstructions.EMPTY); + final URI sourceFileURI; + try { + sourceFileURI = new URI(sourceFile.toString()); + } catch (final URISyntaxException e) { + throw new UncheckedDeephavenException("Failed to parse URI " + sourceFile, e); + } + return readTableInternal(sourceFileURI, ParquetInstructions.EMPTY); } /** @@ -185,7 +193,7 @@ public static Table readTable(@NotNull final File sourceFile) { * key} order) location found will be used to infer schema. * *

- * Delegates to one of {@link #readSingleFileTable(File, ParquetInstructions)}, + * Delegates to one of {@link #readSingleFileTable(URI, ParquetInstructions)}, * {@link #readPartitionedTableWithMetadata(File, ParquetInstructions)}, * {@link #readFlatPartitionedTable(File, ParquetInstructions)}, or * {@link #readKeyValuePartitionedTable(File, ParquetInstructions)}. @@ -201,7 +209,13 @@ public static Table readTable(@NotNull final File sourceFile) { public static Table readTable( @NotNull final File sourceFile, @NotNull final ParquetInstructions readInstructions) { - return readTableInternal(sourceFile, readInstructions); + final URI sourceFileURI; + try { + sourceFileURI = new URI(sourceFile.toString()); + } catch (final URISyntaxException e) { + throw new UncheckedDeephavenException("Failed to parse URI " + sourceFile, e); + } + return readTableInternal(sourceFileURI, readInstructions); } /** @@ -629,7 +643,7 @@ public static void deleteTable(File path) { * key} order) location found will be used to infer schema. * *

- * Delegates to one of {@link #readSingleFileTable(File, ParquetInstructions)}, + * Delegates to one of {@link #readSingleFileTable(URI, ParquetInstructions)}, * {@link #readPartitionedTableWithMetadata(File, ParquetInstructions)}, * {@link #readFlatPartitionedTable(File, ParquetInstructions)}, or * {@link #readKeyValuePartitionedTable(File, ParquetInstructions)}. @@ -639,31 +653,35 @@ public static void deleteTable(File path) { * @return A {@link Table} */ private static Table readTableInternal( - @NotNull final File source, + @NotNull final URI source, @NotNull final ParquetInstructions instructions) { - final Path sourcePath = source.toPath(); + if (source.getScheme() != null && source.getScheme().equals(S3_PARQUET_FILE_URI_SCHEME)) { + return readSingleFileTable(source, instructions); + } + final Path sourcePath = Path.of(source); if (!Files.exists(sourcePath)) { throw new TableDataException("Source file " + source + " does not exist"); } final String sourceFileName = sourcePath.getFileName().toString(); final BasicFileAttributes sourceAttr = readAttributes(sourcePath); + final File sourceFile = sourcePath.toFile(); if (sourceAttr.isRegularFile()) { if (sourceFileName.endsWith(PARQUET_FILE_EXTENSION)) { return readSingleFileTable(source, instructions); } if (sourceFileName.equals(ParquetMetadataFileLayout.METADATA_FILE_NAME)) { - return readPartitionedTableWithMetadata(source.getParentFile(), instructions); + return readPartitionedTableWithMetadata(sourceFile.getParentFile(), instructions); } if (sourceFileName.equals(ParquetMetadataFileLayout.COMMON_METADATA_FILE_NAME)) { - return readPartitionedTableWithMetadata(source.getParentFile(), instructions); + return readPartitionedTableWithMetadata(sourceFile.getParentFile(), instructions); } throw new TableDataException( - "Source file " + source + " does not appear to be a parquet file or metadata file"); + "Source file " + sourceFile + " does not appear to be a parquet file or metadata file"); } if (sourceAttr.isDirectory()) { final Path metadataPath = sourcePath.resolve(ParquetMetadataFileLayout.METADATA_FILE_NAME); if (Files.exists(metadataPath)) { - return readPartitionedTableWithMetadata(source, instructions); + return readPartitionedTableWithMetadata(sourceFile, instructions); } final Path firstEntryPath; // Ignore dot files while looking for the first entry @@ -680,10 +698,10 @@ private static Table readTableInternal( final String firstEntryFileName = firstEntryPath.getFileName().toString(); final BasicFileAttributes firstEntryAttr = readAttributes(firstEntryPath); if (firstEntryAttr.isDirectory() && firstEntryFileName.contains("=")) { - return readKeyValuePartitionedTable(source, instructions); + return readKeyValuePartitionedTable(sourceFile, instructions); } if (firstEntryAttr.isRegularFile() && firstEntryFileName.endsWith(PARQUET_FILE_EXTENSION)) { - return readFlatPartitionedTable(source, instructions); + return readFlatPartitionedTable(sourceFile, instructions); } throw new TableDataException("No recognized Parquet table layout found in " + source); } @@ -707,8 +725,8 @@ private static BasicFileAttributes readAttributes(@NotNull final Path path) { * Reads in a table from a single parquet file using the provided table definition. * *

- * Callers may prefer the simpler methods {@link #readSingleFileTable(File, ParquetInstructions)} or - * {@link #readSingleFileTable(File, ParquetInstructions, TableDefinition)}. + * Callers may prefer the simpler methods {@link #readSingleFileTable(URI, ParquetInstructions)} or + * {@link #readSingleFileTable(URI, ParquetInstructions, TableDefinition)}. * * @param tableLocationKey The {@link ParquetTableLocationKey location keys} to include * @param readInstructions Instructions for customizations while reading @@ -728,7 +746,7 @@ public static Table readSingleFileTable( new ParquetTableLocationFactory(readInstructions), null); return new SimpleSourceTable(tableDefinition.getWritable(), - "Read single parquet file from " + tableLocationKey.getFile(), + "Read single parquet file from " + tableLocationKey.getURI(), RegionedTableComponentFactoryImpl.INSTANCE, locationProvider, null); } @@ -946,23 +964,33 @@ public static Table readFlatPartitionedTable( tableDefinition); } - /** - * Creates a single table via the parquet {@code file} using the table definition derived from that {@code file}. - * - *

- * Callers wishing to be more explicit (for example, to skip some columns) may prefer to call - * {@link #readSingleFileTable(File, ParquetInstructions, TableDefinition)}. - * - * @param file the parquet file - * @param readInstructions the instructions for customizations while reading - * @return the table - * @see ParquetTableLocationKey#ParquetTableLocationKey(File, int, Map) - * @see #readSingleFileTable(ParquetTableLocationKey, ParquetInstructions, TableDefinition) - */ + // /** + // * Creates a single table via the parquet {@code file} using the table definition derived from that {@code file}. + // * + // *

+ // * Callers wishing to be more explicit (for example, to skip some columns) may prefer to call + // * {@link #readSingleFileTable(File, ParquetInstructions, TableDefinition)}. + // * + // * @param file the parquet file + // * @param readInstructions the instructions for customizations while reading + // * @return the table + // * @see ParquetTableLocationKey#ParquetTableLocationKey(File, int, Map) + // * @see #readSingleFileTable(ParquetTableLocationKey, ParquetInstructions, TableDefinition) + // */ + // public static Table readSingleFileTable( + // @NotNull final File file, + // @NotNull final ParquetInstructions readInstructions) { + // final ParquetSingleFileLayout keyFinder = new ParquetSingleFileLayout(file, readInstructions); + // final KnownLocationKeyFinder inferenceKeys = toKnownKeys(keyFinder); + // final Pair inference = infer(inferenceKeys, readInstructions); + // return readSingleFileTable(inferenceKeys.getFirstKey().orElseThrow(), inference.getSecond(), + // inference.getFirst()); + // } + public static Table readSingleFileTable( - @NotNull final File file, + @NotNull final URI parquetFileURI, @NotNull final ParquetInstructions readInstructions) { - final ParquetSingleFileLayout keyFinder = new ParquetSingleFileLayout(file, readInstructions); + final ParquetSingleFileLayout keyFinder = new ParquetSingleFileLayout(parquetFileURI, readInstructions); final KnownLocationKeyFinder inferenceKeys = toKnownKeys(keyFinder); final Pair inference = infer(inferenceKeys, readInstructions); return readSingleFileTable(inferenceKeys.getFirstKey().orElseThrow(), inference.getSecond(), @@ -972,18 +1000,19 @@ public static Table readSingleFileTable( /** * Creates a single table via the parquet {@code file} using the provided {@code tableDefinition}. * - * @param file the parquet file + * @param parquetFileURI the parquet file * @param readInstructions the instructions for customizations while reading * @param tableDefinition the table definition * @return the table - * @see ParquetTableLocationKey#ParquetTableLocationKey(File, int, Map) + * @see ParquetTableLocationKey#ParquetTableLocationKey(URI, int, Map, ParquetInstructions) * @see #readSingleFileTable(ParquetTableLocationKey, ParquetInstructions, TableDefinition) */ public static Table readSingleFileTable( - @NotNull final File file, + @NotNull final URI parquetFileURI, @NotNull final ParquetInstructions readInstructions, @NotNull final TableDefinition tableDefinition) { - return readSingleFileTable(new ParquetTableLocationKey(file, 0, null, readInstructions), readInstructions, + return readSingleFileTable(new ParquetTableLocationKey(parquetFileURI, 0, null, readInstructions), + readInstructions, tableDefinition); } @@ -1056,51 +1085,48 @@ private static ParquetSchemaReader.ColumnDefinitionConsumer makeSchemaReaderCons * Make a {@link ParquetFileReader} for the supplied {@link File}. Wraps {@link IOException} as * {@link TableDataException}. 
* - * @param parquetFile The {@link File} to read + * @param parquetFileURI The {@link File} to read * @param readInstructions the instructions for customizations while reading * @return The new {@link ParquetFileReader} */ - public static ParquetFileReader getParquetFileReader(@NotNull final File parquetFile, + public static ParquetFileReader getParquetFileReader(@NotNull final URI parquetFileURI, @NotNull final ParquetInstructions readInstructions) { try { - return getParquetFileReaderChecked(parquetFile, readInstructions); + return getParquetFileReaderChecked(parquetFileURI, readInstructions); } catch (IOException e) { - throw new TableDataException("Failed to create Parquet file reader: " + parquetFile, e); + throw new TableDataException("Failed to create Parquet file reader: " + parquetFileURI, e); } } /** * Make a {@link ParquetFileReader} for the supplied {@link File}. * - * @param parquetFile The {@link File} to read + * @param parquetFileURI The {@link File} to read * @return The new {@link ParquetFileReader} * @throws IOException if an IO exception occurs */ public static ParquetFileReader getParquetFileReaderChecked( - @NotNull final File parquetFile, + @NotNull final URI parquetFileURI, @NotNull final ParquetInstructions readInstructions) throws IOException { - final String absolutePath = parquetFile.getAbsolutePath(); - final String S3_MARKER = "s3:/"; - if (absolutePath.contains(S3_MARKER)) { - // TODO I am creating S3 URI back from the file path which is incorrect, should have passed URI only - final int index = absolutePath.indexOf(S3_MARKER); - final String s3uri = S3_MARKER + absolutePath.substring(index + S3_MARKER.length() - 1); - return new ParquetFileReader(absolutePath, + if (parquetFileURI.getScheme() != null && parquetFileURI.getScheme().equals(S3_PARQUET_FILE_URI_SCHEME)) { + return new ParquetFileReader(parquetFileURI, new CachedChannelProvider( - new S3SeekableChannelProvider(readInstructions.getAwsRegionName(), s3uri), 1 << 7)); + new S3SeekableChannelProvider(readInstructions.getAwsRegionName(), + parquetFileURI.toString()), + 1 << 7)); } return new ParquetFileReader( - parquetFile.getAbsolutePath(), + parquetFileURI, new CachedChannelProvider( new TrackedSeekableChannelsProvider(TrackedFileHandleFactory.getInstance()), 1 << 7)); } @VisibleForTesting public static Table readParquetSchemaAndTable( - @NotNull final File source, @NotNull final ParquetInstructions readInstructionsIn, + @NotNull final URI sourceURI, @NotNull final ParquetInstructions readInstructionsIn, MutableObject instructionsOut) { final ParquetTableLocationKey tableLocationKey = - new ParquetTableLocationKey(source, 0, null, readInstructionsIn); + new ParquetTableLocationKey(sourceURI, 0, null, readInstructionsIn); final Pair>, ParquetInstructions> schemaInfo = convertSchema( tableLocationKey.getFileReader().getSchema(), tableLocationKey.getMetadata().getFileMetaData().getKeyValueMetaData(), diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/layout/DeephavenNestedPartitionLayout.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/layout/DeephavenNestedPartitionLayout.java index e9699e2deab..473357b2bc8 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/layout/DeephavenNestedPartitionLayout.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/layout/DeephavenNestedPartitionLayout.java @@ -52,7 +52,7 @@ public static DeephavenNestedPartitionLayout forParquet @Override protected 
ParquetTableLocationKey makeKey(@NotNull Path tableLeafDirectory, @NotNull Map> partitions) { - return new ParquetTableLocationKey(tableLeafDirectory.resolve(PARQUET_FILE_NAME).toFile(), 0, + return new ParquetTableLocationKey(tableLeafDirectory.resolve(PARQUET_FILE_NAME).toUri(), 0, partitions, readInstructions); } }; diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/layout/ParquetFlatPartitionedLayout.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/layout/ParquetFlatPartitionedLayout.java index bbaefc5d971..c87abeb8718 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/layout/ParquetFlatPartitionedLayout.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/layout/ParquetFlatPartitionedLayout.java @@ -24,7 +24,7 @@ public final class ParquetFlatPartitionedLayout implements TableLocationKeyFinder { private static ParquetTableLocationKey locationKey(Path path, @NotNull final ParquetInstructions readInstructions) { - return new ParquetTableLocationKey(path.toFile(), 0, null, readInstructions); + return new ParquetTableLocationKey(path.toUri(), 0, null, readInstructions); } private final File tableRootDirectory; diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/layout/ParquetKeyValuePartitionedLayout.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/layout/ParquetKeyValuePartitionedLayout.java index b1ea92a4b70..fcff012e432 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/layout/ParquetKeyValuePartitionedLayout.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/layout/ParquetKeyValuePartitionedLayout.java @@ -29,7 +29,7 @@ public ParquetKeyValuePartitionedLayout( super(tableRootDirectory, ParquetFileHelper::fileNameMatches, () -> new LocationTableBuilderDefinition(tableDefinition), - (path, partitions) -> new ParquetTableLocationKey(path.toFile(), 0, partitions, readInstructions), + (path, partitions) -> new ParquetTableLocationKey(path.toUri(), 0, partitions, readInstructions), Math.toIntExact(tableDefinition.getColumnStream().filter(ColumnDefinition::isPartitioning).count())); } @@ -40,7 +40,7 @@ public ParquetKeyValuePartitionedLayout( super(tableRootDirectory, ParquetFileHelper::fileNameMatches, () -> new LocationTableBuilderCsv(tableRootDirectory), - (path, partitions) -> new ParquetTableLocationKey(path.toFile(), 0, partitions, readInstructions), + (path, partitions) -> new ParquetTableLocationKey(path.toUri(), 0, partitions, readInstructions), maxPartitioningLevels); } } diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/layout/ParquetMetadataFileLayout.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/layout/ParquetMetadataFileLayout.java index a7ed9a754c3..2850e816a6e 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/layout/ParquetMetadataFileLayout.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/layout/ParquetMetadataFileLayout.java @@ -94,7 +94,8 @@ public ParquetMetadataFileLayout( if (!metadataFile.exists()) { throw new TableDataException(String.format("Parquet metadata file %s does not exist", metadataFile)); } - final ParquetFileReader metadataFileReader = ParquetTools.getParquetFileReader(metadataFile, inputInstructions); + final ParquetFileReader metadataFileReader = + ParquetTools.getParquetFileReader(metadataFile.toURI(), inputInstructions); final 
ParquetMetadataConverter converter = new ParquetMetadataConverter(); final ParquetMetadata metadataFileMetadata = convertMetadata(metadataFile, metadataFileReader, converter); @@ -105,7 +106,7 @@ public ParquetMetadataFileLayout( if (commonMetadataFile != null && commonMetadataFile.exists()) { final ParquetFileReader commonMetadataFileReader = - ParquetTools.getParquetFileReader(commonMetadataFile, inputInstructions); + ParquetTools.getParquetFileReader(commonMetadataFile.toURI(), inputInstructions); final Pair>, ParquetInstructions> fullSchemaInfo = ParquetTools.convertSchema( commonMetadataFileReader.getSchema(), convertMetadata(commonMetadataFile, commonMetadataFileReader, converter).getFileMetaData() @@ -196,7 +197,7 @@ public ParquetMetadataFileLayout( partitions.put(partitionKey, partitionValue); } } - final ParquetTableLocationKey tlk = new ParquetTableLocationKey(new File(directory, filePathString), + final ParquetTableLocationKey tlk = new ParquetTableLocationKey(new File(directory, filePathString).toURI(), partitionOrder.getAndIncrement(), partitions, inputInstructions); tlk.setFileReader(metadataFileReader); tlk.setMetadata(metadataFileMetadata); diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/layout/ParquetSingleFileLayout.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/layout/ParquetSingleFileLayout.java index 8fb14b9ad42..f4a330389fd 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/layout/ParquetSingleFileLayout.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/layout/ParquetSingleFileLayout.java @@ -9,32 +9,32 @@ import org.jetbrains.annotations.NotNull; import java.io.File; +import java.net.URI; import java.util.function.Consumer; /** * Parquet {@link TableLocationKeyFinder location finder} that will discover a single file. 
*/ public final class ParquetSingleFileLayout implements TableLocationKeyFinder { - - private final File parquetFile; + private final URI parquetFileUri; private final ParquetInstructions readInstructions; /** - * @param parquetFile The single parquet file to find + * @param parquetFileUri URI of single parquet file to find * @param readInstructions the instructions for customizations while reading */ - public ParquetSingleFileLayout(@NotNull final File parquetFile, + public ParquetSingleFileLayout(@NotNull final URI parquetFileUri, @NotNull final ParquetInstructions readInstructions) { - this.parquetFile = parquetFile; + this.parquetFileUri = parquetFileUri; this.readInstructions = readInstructions; } public String toString() { - return ParquetSingleFileLayout.class.getSimpleName() + '[' + parquetFile + ']'; + return ParquetSingleFileLayout.class.getSimpleName() + '[' + parquetFileUri + ']'; } @Override public void findKeys(@NotNull final Consumer locationKeyObserver) { - locationKeyObserver.accept(new ParquetTableLocationKey(parquetFile, 0, null, readInstructions)); + locationKeyObserver.accept(new ParquetTableLocationKey(parquetFileUri, 0, null, readInstructions)); } } diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetColumnLocation.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetColumnLocation.java index 6c61c9276e5..8e8f4374a0b 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetColumnLocation.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetColumnLocation.java @@ -3,6 +3,7 @@ */ package io.deephaven.parquet.table.location; +import io.deephaven.UncheckedDeephavenException; import io.deephaven.base.verify.Assert; import io.deephaven.base.verify.Require; import io.deephaven.chunk.*; @@ -50,6 +51,8 @@ import java.io.UncheckedIOException; import java.math.BigDecimal; import java.math.BigInteger; +import java.net.URI; +import java.net.URISyntaxException; import java.util.*; import java.util.function.Function; import java.util.function.LongFunction; @@ -58,6 +61,7 @@ import java.util.stream.Stream; import static io.deephaven.engine.table.impl.sources.regioned.RegionedColumnSource.ROW_KEY_TO_SUB_REGION_ROW_INDEX_MASK; +import static io.deephaven.parquet.base.ParquetFileReader.S3_PARQUET_FILE_URI_SCHEME; import static io.deephaven.parquet.table.ParquetTableWriter.*; final class ParquetColumnLocation extends AbstractColumnLocation { @@ -150,8 +154,8 @@ private ParquetTableLocation tl() { /** * Helper method for logging a warning on failure in reading an index file */ - private void logWarnFailedToRead(final String indexFilePath) { - log.warn().append("Failed to read expected index file ").append(indexFilePath) + private void logWarnFailedToRead(final URI indexFileURI) { + log.warn().append("Failed to read expected index file ").append(indexFileURI.toString()) .append(" for table location ").append(tl()).append(", column ") .append(getName()) .endl(); @@ -163,26 +167,30 @@ public METADATA_TYPE getMetadata(@NotNull final ColumnDefinition if (!hasGroupingTable) { return null; } - final File parquetFile = tl().getParquetFile(); + final URI parquetFileURI = tl().getParquetFile(); + if (parquetFileURI.getScheme() != null && parquetFileURI.getScheme().equals(S3_PARQUET_FILE_URI_SCHEME)) { + throw new UncheckedDeephavenException("Parquet files in S3 are not expected to have indexing files"); + } + final File parquetFile = new 
File(parquetFileURI.toString()); try { ParquetFileReader parquetFileReader; - final String indexFilePath; + final URI indexFileURI; final GroupingColumnInfo groupingColumnInfo = tl().getGroupingColumns().get(parquetColumnName); if (groupingColumnInfo != null) { final String indexFileRelativePath = groupingColumnInfo.groupingTablePath(); - indexFilePath = parquetFile.toPath().getParent().resolve(indexFileRelativePath).toString(); + indexFileURI = new URI(parquetFile.toPath().getParent().resolve(indexFileRelativePath).toString()); try { - parquetFileReader = new ParquetFileReader(indexFilePath, tl().getChannelProvider()); + parquetFileReader = new ParquetFileReader(indexFileURI, tl().getChannelProvider()); } catch (final RuntimeException e) { - logWarnFailedToRead(indexFilePath); + logWarnFailedToRead(indexFileURI); return null; } } else { final String relativeIndexFilePath = ParquetTools.getRelativeIndexFilePath(parquetFile, parquetColumnName); - indexFilePath = parquetFile.toPath().getParent().resolve(relativeIndexFilePath).toString(); + indexFileURI = new URI(parquetFile.toPath().getParent().resolve(relativeIndexFilePath).toString()); try { - parquetFileReader = new ParquetFileReader(indexFilePath, tl().getChannelProvider()); + parquetFileReader = new ParquetFileReader(indexFileURI, tl().getChannelProvider()); } catch (final RuntimeException e1) { // Retry with legacy grouping file path final String legacyGroupingFileName = @@ -190,9 +198,10 @@ public METADATA_TYPE getMetadata(@NotNull final ColumnDefinition final File legacyGroupingFile = new File(parquetFile.getParent(), legacyGroupingFileName); try { parquetFileReader = - new ParquetFileReader(legacyGroupingFile.getAbsolutePath(), tl().getChannelProvider()); + new ParquetFileReader(new URI(legacyGroupingFile.getAbsolutePath()), + tl().getChannelProvider()); } catch (final RuntimeException e2) { - logWarnFailedToRead(indexFilePath); + logWarnFailedToRead(indexFileURI); return null; } } @@ -212,7 +221,7 @@ public METADATA_TYPE getMetadata(@NotNull final ColumnDefinition final ColumnChunkReader endPosReader = rowGroupReader.getColumnChunk(Collections.singletonList(END_POS)); if (groupingKeyReader == null || beginPosReader == null || endPosReader == null) { - log.warn().append("Index file ").append(indexFilePath) + log.warn().append("Index file ").append(indexFileURI.toString()) .append(" is missing one or more expected columns for table location ") .append(tl()).append(", column ").append(getName()); return null; @@ -244,6 +253,8 @@ public METADATA_TYPE getMetadata(@NotNull final ColumnDefinition .get(); } catch (IOException e) { throw new UncheckedIOException(e); + } catch (URISyntaxException e) { + throw new UncheckedDeephavenException(e); } } diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetTableLocation.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetTableLocation.java index 88d2ae87acc..897017b7766 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetTableLocation.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetTableLocation.java @@ -26,9 +26,12 @@ import org.jetbrains.annotations.NotNull; import java.io.File; +import java.net.URI; import java.util.*; import java.util.stream.IntStream; +import static io.deephaven.parquet.base.ParquetFileReader.S3_PARQUET_FILE_URI_SCHEME; + public class ParquetTableLocation extends AbstractTableLocation { private static final 
String IMPLEMENTATION_NAME = ParquetColumnLocation.class.getSimpleName(); @@ -87,7 +90,13 @@ public ParquetTableLocation(@NotNull final TableKey tableKey, columnTypes = tableInfo.map(TableInfo::columnTypeMap).orElse(Collections.emptyMap()); version = tableInfo.map(TableInfo::version).orElse(null); - handleUpdate(computeIndex(), tableLocationKey.getFile().lastModified()); + final String uriScheme = tableLocationKey.getURI().getScheme(); + if (uriScheme != null && uriScheme.equals(S3_PARQUET_FILE_URI_SCHEME)) { + handleUpdate(computeIndex(), 0L); // TODO What should I put here? + } else { + handleUpdate(computeIndex(), new File(tableLocationKey.getURI().toString()).lastModified()); + } + } @Override @@ -98,8 +107,8 @@ public String getImplementationName() { @Override public void refresh() {} - File getParquetFile() { - return ((ParquetTableLocationKey) getKey()).getFile(); + URI getParquetFile() { + return ((ParquetTableLocationKey) getKey()).getURI(); } ParquetInstructions getReadInstructions() { diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetTableLocationFactory.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetTableLocationFactory.java index 5dc3d051903..09dd646b7ba 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetTableLocationFactory.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetTableLocationFactory.java @@ -13,6 +13,9 @@ import org.jetbrains.annotations.Nullable; import java.io.File; +import java.net.URI; + +import static io.deephaven.parquet.base.ParquetFileReader.S3_PARQUET_FILE_URI_SCHEME; /** * {@link TableLocationFactory} for {@link ParquetTableLocation}s. @@ -30,9 +33,9 @@ public ParquetTableLocationFactory(@NotNull final ParquetInstructions readInstru public TableLocation makeLocation(@NotNull final TableKey tableKey, @NotNull final ParquetTableLocationKey locationKey, @Nullable final TableDataRefreshService refreshService) { - final File parquetFile = locationKey.getFile(); - // TODO Again hacky, need to keep a URI and check if its a file or not and then do existence check - if (parquetFile.getAbsolutePath().contains("s3:/") || parquetFile.exists()) { + final URI parquetFileURI = locationKey.getURI(); + if ((parquetFileURI.getScheme() != null && parquetFileURI.getScheme().equals(S3_PARQUET_FILE_URI_SCHEME)) + || new File(parquetFileURI.toString()).exists()) { return new ParquetTableLocation(tableKey, locationKey, readInstructions); } else { return new NonexistentTableLocation(tableKey, locationKey); diff --git a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetTableLocationKey.java b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetTableLocationKey.java index fcb6250203b..a582ecb394d 100644 --- a/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetTableLocationKey.java +++ b/extensions/parquet/table/src/main/java/io/deephaven/parquet/table/location/ParquetTableLocationKey.java @@ -18,6 +18,7 @@ import java.io.File; import java.io.IOException; +import java.net.URI; import java.util.List; import java.util.Map; import java.util.stream.IntStream; @@ -37,25 +38,25 @@ public class ParquetTableLocationKey extends FileTableLocationKey { /** * Construct a new ParquetTableLocationKey for the supplied {@code file} and {@code partitions}. * - * @param file The parquet file that backs the keyed location. 
Will be adjusted to an absolute path. + * @param parquetFileUri The parquet file that backs the keyed location. Will be adjusted to an absolute path. * @param order Explicit ordering index, taking precedence over other fields * @param partitions The table partitions enclosing the table location keyed by {@code this}. Note that if this * parameter is {@code null}, the location will be a member of no partitions. An ordered copy of the map will * be made, so the calling code is free to mutate the map after this call * @param readInstructions the instructions for customizations while reading */ - public ParquetTableLocationKey(@NotNull final File file, final int order, + public ParquetTableLocationKey(@NotNull final URI parquetFileUri, final int order, @Nullable final Map> partitions, @NotNull final ParquetInstructions readInstructions) { - super(validateParquetFile(file), order, partitions); + super(validateParquetFile(parquetFileUri), order, partitions); this.readInstructions = readInstructions; } - private static File validateParquetFile(@NotNull final File file) { - if (!file.getName().endsWith(ParquetTableWriter.PARQUET_FILE_EXTENSION)) { + private static URI validateParquetFile(@NotNull final URI parquetFileUri) { + if (!parquetFileUri.getRawPath().endsWith(ParquetTableWriter.PARQUET_FILE_EXTENSION)) { throw new IllegalArgumentException("Parquet file must end in " + ParquetTableWriter.PARQUET_FILE_EXTENSION); } - return file; + return parquetFileUri; } @Override @@ -77,7 +78,7 @@ public String getImplementationName() { * * * Callers wishing to handle these cases more explicit may call - * {@link ParquetTools#getParquetFileReaderChecked(File, ParquetInstructions)}. + * {@link ParquetTools#getParquetFileReaderChecked(URI, ParquetInstructions)}. * * @return true if the file reader exists or was successfully created */ @@ -86,7 +87,7 @@ public synchronized boolean verifyFileReader() { return true; } try { - fileReader = ParquetTools.getParquetFileReaderChecked(file, readInstructions); + fileReader = ParquetTools.getParquetFileReaderChecked(parquetFileURI, readInstructions); } catch (IOException e) { return false; } @@ -103,7 +104,7 @@ public synchronized ParquetFileReader getFileReader() { if (fileReader != null) { return fileReader; } - return fileReader = ParquetTools.getParquetFileReader(file, readInstructions); + return fileReader = ParquetTools.getParquetFileReader(parquetFileURI, readInstructions); } /** @@ -132,7 +133,7 @@ public synchronized ParquetMetadata getMetadata() { try { return metadata = new ParquetMetadataConverter().fromParquetMetadata(getFileReader().fileMetaData); } catch (IOException e) { - throw new TableDataException("Failed to convert Parquet file metadata: " + getFile(), e); + throw new TableDataException("Failed to convert Parquet file metadata: " + getURI(), e); } } @@ -167,7 +168,7 @@ public synchronized int[] getRowGroupIndices() { // we're not expecting that in this code path. To support it, discovery tools should figure out // the row groups for a partition themselves and call setRowGroupReaders. 
final String filePath = rowGroups.get(rgi).getColumns().get(0).getFile_path(); - return filePath == null || new File(filePath).getAbsoluteFile().equals(file); + return filePath == null || new File(filePath).getAbsoluteFile().equals(parquetFileURI.toString()); }).toArray(); } diff --git a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java index 57805742e18..c67ba23edd9 100644 --- a/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java +++ b/extensions/parquet/table/src/test/java/io/deephaven/parquet/table/ParquetTableReadWriteTest.java @@ -64,6 +64,9 @@ import java.io.Serializable; import java.math.BigDecimal; import java.math.BigInteger; +import java.net.URI; +import java.net.URISyntaxException; +import java.net.URL; import java.time.Instant; import java.util.ArrayList; import java.util.Arrays; @@ -378,7 +381,7 @@ public void testParquetLz4CompressionCodec() { } @Test - public void test_lz4_compressed() { + public void test_lz4_compressed() throws URISyntaxException { // Write and read a LZ4 compressed file File dest = new File(rootFile + File.separator + "Table.parquet"); final Table table = getTableFlat(100, false, false); @@ -390,11 +393,11 @@ public void test_lz4_compressed() { // The following file is tagged as LZ4 compressed based on its metadata, but is actually compressed with // LZ4_RAW. We should be able to read it anyway with no exceptions. String path = TestParquetTools.class.getResource("/sample_lz4_compressed.parquet").getFile(); - readSingleFileTable(new File(path), EMPTY).select(); + readSingleFileTable(new URI(path), EMPTY).select(); } catch (RuntimeException e) { TestCase.fail("Failed to read parquet file sample_lz4_compressed.parquet"); } - File randomDest = new File(rootFile, "random.parquet"); + final File randomDest = new File(rootFile, "random.parquet"); writeTable(fromDisk, randomDest, ParquetTools.LZ4_RAW); // Read the LZ4 compressed file again, to make sure we use a new adapter @@ -430,7 +433,7 @@ public void testParquetSnappyCompressionCodec() { } @Test - public void testBigDecimalPrecisionScale() { + public void testBigDecimalPrecisionScale() throws URISyntaxException { // https://github.com/deephaven/deephaven-core/issues/3650 final BigDecimal myBigDecimal = new BigDecimal(".0005"); assertEquals(1, myBigDecimal.precision()); @@ -438,7 +441,7 @@ public void testBigDecimalPrecisionScale() { final Table table = newTable(new ColumnHolder<>("MyBigDecimal", BigDecimal.class, null, false, myBigDecimal)); final File dest = new File(rootFile, "ParquetTest_testBigDecimalPrecisionScale.parquet"); writeTable(table, dest); - final Table fromDisk = readSingleFileTable(dest, EMPTY); + final Table fromDisk = readSingleFileTable(new URI(dest.toString()), EMPTY); try (final CloseableIterator it = fromDisk.objectColumnIterator("MyBigDecimal")) { assertTrue(it.hasNext()); final BigDecimal item = it.next(); @@ -507,7 +510,7 @@ private static Table arrayToVectorTable(final Table table) { } @Test - public void testArrayColumns() { + public void testArrayColumns() throws URISyntaxException { ArrayList columns = new ArrayList<>(Arrays.asList( "someStringArrayColumn = new String[] {i % 10 == 0 ? 
null : (`` + (i % 101))}", @@ -554,7 +557,8 @@ public void testArrayColumns() { writeReadTableTest(arrayTable, dest, writeInstructions); // Make sure the column didn't use dictionary encoding - ParquetMetadata metadata = new ParquetTableLocationKey(dest, 0, null, ParquetInstructions.EMPTY).getMetadata(); + ParquetMetadata metadata = + new ParquetTableLocationKey(new URI(dest.toString()), 0, null, ParquetInstructions.EMPTY).getMetadata(); String firstColumnMetadata = metadata.getBlocks().get(0).getColumns().get(0).toString(); assertTrue(firstColumnMetadata.contains("someStringArrayColumn") && !firstColumnMetadata.contains("RLE_DICTIONARY")); @@ -563,14 +567,15 @@ public void testArrayColumns() { writeReadTableTest(vectorTable, dest, writeInstructions); // Make sure the column didn't use dictionary encoding - metadata = new ParquetTableLocationKey(dest, 0, null, ParquetInstructions.EMPTY).getMetadata(); + metadata = + new ParquetTableLocationKey(new URI(dest.toString()), 0, null, ParquetInstructions.EMPTY).getMetadata(); firstColumnMetadata = metadata.getBlocks().get(0).getColumns().get(0).toString(); assertTrue(firstColumnMetadata.contains("someStringArrayColumn") && !firstColumnMetadata.contains("RLE_DICTIONARY")); } @Test - public void readLongParquetFileFromS3Test() { + public void readLongParquetFileFromS3Test() throws URISyntaxException { final ParquetInstructions readInstructions = new ParquetInstructions.Builder() .setAwsRegionName("us-east-2") .build(); @@ -595,7 +600,7 @@ public void readLongParquetFileFromS3Test() { "s3://aws-public-blockchain/v1.0/btc/transactions/date=2023-11-13/part-00000-da3a3c27-700d-496d-9c41-81281388eca8-c000.snappy.parquet", readInstructions, tableDefinition).select(); final Table fromDisk1 = ParquetTools.readSingleFileTable( - new File( + new URI( "/Users/shivammalhotra/Documents/part-00000-da3a3c27-700d-496d-9c41-81281388eca8-c000.snappy.parquet"), ParquetTools.SNAPPY, tableDefinition).select(); @@ -603,7 +608,7 @@ public void readLongParquetFileFromS3Test() { } @Test - public void readRefParquetFileFromS3Test() { + public void readRefParquetFileFromS3Test() throws URISyntaxException { final ParquetInstructions readInstructions = new ParquetInstructions.Builder() .setAwsRegionName("us-east-2") .build(); @@ -627,7 +632,7 @@ public void readRefParquetFileFromS3Test() { "s3://aws-public-blockchain/v1.0/btc/transactions/date=2009-01-03/part-00000-bdd84ab2-82e9-4a79-8212-7accd76815e8-c000.snappy.parquet", readInstructions, tableDefinition).head(5).select(); final Table fromDisk1 = ParquetTools.readSingleFileTable( - new File( + new URI( "/Users/shivammalhotra/Documents/part-00000-bdd84ab2-82e9-4a79-8212-7accd76815e8-c000.snappy.parquet"), ParquetTools.SNAPPY, tableDefinition).head(5).select(); @@ -637,7 +642,7 @@ public void readRefParquetFileFromS3Test() { "s3://aws-public-blockchain/v1.0/btc/transactions/date=2023-11-13/part-00000-da3a3c27-700d-496d-9c41-81281388eca8-c000.snappy.parquet", readInstructions, tableDefinition).head(5).select(); final Table fromDisk2 = ParquetTools.readSingleFileTable( - new File( + new URI( "/Users/shivammalhotra/Documents/part-00000-da3a3c27-700d-496d-9c41-81281388eca8-c000.snappy.parquet"), ParquetTools.SNAPPY, tableDefinition).head(5).select(); @@ -645,7 +650,7 @@ public void readRefParquetFileFromS3Test() { } @Test - public void readRefParquetFileLocally() { + public void readRefParquetFileLocally() throws URISyntaxException { final TableDefinition tableDefinition = TableDefinition.of( ColumnDefinition.ofString("hash"), 
ColumnDefinition.ofLong("version"), @@ -663,7 +668,7 @@ public void readRefParquetFileLocally() { ColumnDefinition.ofDouble("input_value")); final Table fromAws1 = ParquetTools.readSingleFileTable( - new File( + new URI( "/Users/shivammalhotra/Documents/part-00000-da3a3c27-700d-496d-9c41-81281388eca8-c000.snappy.parquet"), // new File( // "/Users/shivammalhotra/Documents/part-00000-bdd84ab2-82e9-4a79-8212-7accd76815e8-c000.snappy.parquet"), @@ -807,7 +812,7 @@ private Table maybeFixBigDecimal(Table toFix) { .dropColumns("bdColE"); } - private static Table readParquetFileFromGitLFS(final File dest) { + private static Table readParquetFileFromGitLFS(final URI dest) { try { return readSingleFileTable(dest, EMPTY); } catch (final RuntimeException e) { @@ -846,18 +851,18 @@ private static Table readParquetFileFromGitLFS(final File dest) { * */ @Test - public void testReadOldParquetData() { + public void testReadOldParquetData() throws URISyntaxException { String path = ParquetTableReadWriteTest.class.getResource("/ReferenceParquetData.parquet").getFile(); - readParquetFileFromGitLFS(new File(path)).select(); + readParquetFileFromGitLFS(new URI(path)).select(); final ParquetMetadata metadata = - new ParquetTableLocationKey(new File(path), 0, null, ParquetInstructions.EMPTY).getMetadata(); + new ParquetTableLocationKey(new URI(path), 0, null, ParquetInstructions.EMPTY).getMetadata(); assertTrue(metadata.getFileMetaData().getKeyValueMetaData().get("deephaven").contains("\"version\":\"0.4.0\"")); path = ParquetTableReadWriteTest.class.getResource("/ReferenceParquetVectorData.parquet").getFile(); - readParquetFileFromGitLFS(new File(path)).select(); + readParquetFileFromGitLFS(new URI(path)).select(); path = ParquetTableReadWriteTest.class.getResource("/ReferenceParquetArrayData.parquet").getFile(); - readParquetFileFromGitLFS(new File(path)).select(); + readParquetFileFromGitLFS(new URI(path)).select(); } @Test @@ -895,17 +900,17 @@ public void testVersionChecks() { * */ @Test - public void testReadingParquetFilesWithDifferentPageSizes() { + public void testReadingParquetFilesWithDifferentPageSizes() throws URISyntaxException { Table expected = TableTools.emptyTable(100).update( "intArrays = java.util.stream.IntStream.range(0, i).toArray()").reverse(); String path = ParquetTableReadWriteTest.class .getResource("/ReferenceParquetFileWithDifferentPageSizes1.parquet").getFile(); - Table fromDisk = readParquetFileFromGitLFS(new File(path)); + Table fromDisk = readParquetFileFromGitLFS(new URI(path)); assertTableEquals(expected, fromDisk); path = ParquetTableReadWriteTest.class .getResource("/ReferenceParquetFileWithDifferentPageSizes2.parquet").getFile(); - fromDisk = readParquetFileFromGitLFS(new File(path)); + fromDisk = readParquetFileFromGitLFS(new URI(path)); // Access something on the last page to make sure we can read it final int[] data = (int[]) fromDisk.getColumnSource("intArrays").get(998); @@ -1073,2286 +1078,2288 @@ public void writeMultiTableExceptionTest() { // All files should be deleted even though first table would be written successfully assertTrue(parentDir.list().length == 0); } - - /** - * These are tests for writing to a table with grouping columns to a parquet file and making sure there are no - * unnecessary files left in the directory after we finish writing. 
- */ - @Test - public void groupingColumnsBasicWriteTests() { - groupingColumnsBasicWriteTestsImpl(SINGLE_WRITER); - groupingColumnsBasicWriteTestsImpl(MULTI_WRITER); - } - - public void groupingColumnsBasicWriteTestsImpl(TestParquetTableWriter writer) { - // Create an empty parent directory - final File parentDir = new File(rootFile, "tempDir"); - parentDir.mkdir(); - assertTrue(parentDir.exists() && parentDir.isDirectory() && parentDir.list().length == 0); - - Integer data[] = new Integer[500 * 4]; - for (int i = 0; i < data.length; i++) { - data[i] = i / 4; - } - final TableDefinition tableDefinition = TableDefinition.of(ColumnDefinition.ofInt("vvv").withGrouping()); - final Table tableToSave = newTable(tableDefinition, TableTools.col("vvv", data)); - - final String destFilename = "groupingColumnsWriteTests.parquet"; - final File destFile = new File(parentDir, destFilename); - writer.writeTable(tableToSave, destFile); - String vvvIndexFilePath = ".dh_metadata/indexes/vvv/index_vvv_groupingColumnsWriteTests.parquet"; - verifyFilesInDir(parentDir, new String[] {destFilename}, Map.of("vvv", new String[] {vvvIndexFilePath})); - - checkSingleTable(tableToSave, destFile); - - // Verify that the key-value metadata in the file has the correct name - ParquetTableLocationKey tableLocationKey = - new ParquetTableLocationKey(destFile, 0, null, ParquetInstructions.EMPTY); - String metadataString = tableLocationKey.getMetadata().getFileMetaData().toString(); - assertTrue(metadataString.contains(vvvIndexFilePath)); - - // Write another table but this write should fail - final TableDefinition badTableDefinition = TableDefinition.of(ColumnDefinition.ofInt("www").withGrouping()); - final Table badTable = newTable(badTableDefinition, TableTools.col("www", data)) - .updateView("InputString = ii % 2 == 0 ? 
Long.toString(ii) : null", "A=InputString.charAt(0)"); - try { - writer.writeTable(badTable, destFile); - TestCase.fail("Exception expected for invalid formula"); - } catch (UncheckedDeephavenException e) { - assertTrue(e.getCause() instanceof FormulaEvaluationException); - } - - // Make sure that original file is preserved and no temporary files - verifyFilesInDir(parentDir, new String[] {destFilename}, Map.of("vvv", new String[] {vvvIndexFilePath})); - checkSingleTable(tableToSave, destFile); - FileUtils.deleteRecursively(parentDir); - } - - @Test - public void legacyGroupingFileReadTest() { - final String path = - ParquetTableReadWriteTest.class.getResource("/ParquetDataWithLegacyGroupingInfo.parquet").getFile(); - final File destFile = new File(path); - - // Read the legacy file and verify that grouping column is read correctly - final Table fromDisk = readParquetFileFromGitLFS(destFile); - final String groupingColName = "gcol"; - assertTrue(fromDisk.getDefinition().getColumn(groupingColName).isGrouping()); - - // Verify that the key-value metadata in the file has the correct legacy grouping file name - final ParquetTableLocationKey tableLocationKey = - new ParquetTableLocationKey(destFile, 0, null, ParquetInstructions.EMPTY); - final String metadataString = tableLocationKey.getMetadata().getFileMetaData().toString(); - String groupingFileName = ParquetTools.legacyGroupingFileName(destFile, groupingColName); - assertTrue(metadataString.contains(groupingFileName)); - - // Following is how this file was generated, so verify the table read from disk against this - Integer data[] = new Integer[500 * 4]; - for (int i = 0; i < data.length; i++) { - data[i] = i / 4; - } - final TableDefinition tableDefinition = - TableDefinition.of(ColumnDefinition.ofInt(groupingColName).withGrouping()); - final Table table = newTable(tableDefinition, TableTools.col(groupingColName, data)); - assertTableEquals(fromDisk, table); - } - - @Test - public void parquetDirectoryWithDotFilesTest() throws IOException { - // Create an empty parent directory - final File parentDir = new File(rootFile, "tempDir"); - parentDir.mkdir(); - assertTrue(parentDir.exists() && parentDir.isDirectory() && parentDir.list().length == 0); - - Integer data[] = new Integer[500 * 4]; - for (int i = 0; i < data.length; i++) { - data[i] = i / 4; - } - final TableDefinition tableDefinition = TableDefinition.of(ColumnDefinition.ofInt("vvv").withGrouping()); - final Table tableToSave = newTable(tableDefinition, TableTools.col("vvv", data)); - - final String destFilename = "data.parquet"; - final File destFile = new File(parentDir, destFilename); - writeTable(tableToSave, destFile); - String vvvIndexFilePath = ".dh_metadata/indexes/vvv/index_vvv_data.parquet"; - verifyFilesInDir(parentDir, new String[] {destFilename}, Map.of("vvv", new String[] {vvvIndexFilePath})); - - // Call readTable on parent directory - Table fromDisk = readFlatPartitionedTable(parentDir, EMPTY); - assertTableEquals(fromDisk, tableToSave); - - // Add an empty dot file and dot directory (with valid parquet files) in the parent directory - final File dotFile = new File(parentDir, ".dotFile"); - assertTrue(dotFile.createNewFile()); - final File dotDir = new File(parentDir, ".dotDir"); - assertTrue(dotDir.mkdir()); - final Table someTable = TableTools.emptyTable(5).update("A=(int)i"); - writeTable(someTable, new File(dotDir, "data.parquet")); - fromDisk = readFlatPartitionedTable(parentDir, EMPTY); - assertTableEquals(fromDisk, tableToSave); - - // Add a dot parquet in 
parent directory - final Table anotherTable = TableTools.emptyTable(5).update("A=(int)i"); - final File pqDotFile = new File(parentDir, ".dotFile.parquet"); - writeTable(anotherTable, pqDotFile); - fromDisk = readFlatPartitionedTable(parentDir, EMPTY); - assertTableEquals(fromDisk, tableToSave); - } - - @Test - public void partitionedParquetWithDotFilesTest() throws IOException { - // Create an empty parent directory - final File parentDir = new File(rootFile, "tempDir"); - parentDir.mkdir(); - assertTrue(parentDir.exists() && parentDir.isDirectory() && parentDir.list().length == 0); - - final Table someTable = TableTools.emptyTable(5).update("A=(int)i"); - final File firstPartition = new File(parentDir, "X=A"); - final File firstDataFile = new File(firstPartition, "data.parquet"); - final File secondPartition = new File(parentDir, "X=B"); - final File secondDataFile = new File(secondPartition, "data.parquet"); - - writeTable(someTable, firstDataFile); - writeTable(someTable, secondDataFile); - - Table partitionedTable = readKeyValuePartitionedTable(parentDir, EMPTY).select(); - final Set columnsSet = partitionedTable.getDefinition().getColumnNameSet(); - assertTrue(columnsSet.size() == 2 && columnsSet.contains("A") && columnsSet.contains("X")); - - // Add an empty dot file and dot directory (with valid parquet files) in one of the partitions - final File dotFile = new File(firstPartition, ".dotFile"); - assertTrue(dotFile.createNewFile()); - final File dotDir = new File(firstPartition, ".dotDir"); - assertTrue(dotDir.mkdir()); - writeTable(someTable, new File(dotDir, "data.parquet")); - Table fromDisk = readKeyValuePartitionedTable(parentDir, EMPTY); - assertTableEquals(fromDisk, partitionedTable); - - // Add a dot parquet file in one of the partitions directory - final Table anotherTable = TableTools.emptyTable(5).update("B=(int)i"); - final File pqDotFile = new File(secondPartition, ".dotFile.parquet"); - writeTable(anotherTable, pqDotFile); - fromDisk = readKeyValuePartitionedTable(parentDir, EMPTY); - assertTableEquals(fromDisk, partitionedTable); - } - - /** - * These are tests for writing multiple parquet tables with grouping columns. 
- */ - @Test - public void writeMultiTableGroupingColumnTest() { - // Create an empty parent directory - final File parentDir = new File(rootFile, "tempDir"); - parentDir.mkdir(); - - Integer data[] = new Integer[500 * 4]; - for (int i = 0; i < data.length; i++) { - data[i] = i / 4; - } - final TableDefinition tableDefinition = TableDefinition.of(ColumnDefinition.ofInt("vvv").withGrouping()); - final Table firstTable = newTable(tableDefinition, TableTools.col("vvv", data)); - final String firstFilename = "firstTable.parquet"; - final File firstDestFile = new File(parentDir, firstFilename); - - final Table secondTable = newTable(tableDefinition, TableTools.col("vvv", data)); - final String secondFilename = "secondTable.parquet"; - final File secondDestFile = new File(parentDir, secondFilename); - - Table[] tablesToSave = new Table[] {firstTable, secondTable}; - File[] destFiles = new File[] {firstDestFile, secondDestFile}; - - ParquetTools.writeTables(tablesToSave, firstTable.getDefinition(), destFiles); - - String firstIndexFilePath = ".dh_metadata/indexes/vvv/index_vvv_firstTable.parquet"; - String secondIndexFilePath = ".dh_metadata/indexes/vvv/index_vvv_secondTable.parquet"; - verifyFilesInDir(parentDir, new String[] {firstFilename, secondFilename}, - Map.of("vvv", new String[] {firstIndexFilePath, secondIndexFilePath})); - - // Verify that the key-value metadata in the file has the correct name - ParquetTableLocationKey tableLocationKey = - new ParquetTableLocationKey(firstDestFile, 0, null, ParquetInstructions.EMPTY); - String metadataString = tableLocationKey.getMetadata().getFileMetaData().toString(); - assertTrue(metadataString.contains(firstIndexFilePath)); - tableLocationKey = new ParquetTableLocationKey(secondDestFile, 0, null, ParquetInstructions.EMPTY); - metadataString = tableLocationKey.getMetadata().getFileMetaData().toString(); - assertTrue(metadataString.contains(secondIndexFilePath)); - - // Read back the files and verify contents match - checkSingleTable(firstTable, firstDestFile); - checkSingleTable(secondTable, secondDestFile); - } - - @Test - public void groupingColumnsOverwritingTests() { - groupingColumnsOverwritingTestsImpl(SINGLE_WRITER); - groupingColumnsOverwritingTestsImpl(MULTI_WRITER); - } - - public void groupingColumnsOverwritingTestsImpl(TestParquetTableWriter writer) { - // Create an empty parent directory - final File parentDir = new File(rootFile, "tempDir"); - parentDir.mkdir(); - assertTrue(parentDir.exists() && parentDir.isDirectory() && parentDir.list().length == 0); - - Integer data[] = new Integer[500 * 4]; - for (int i = 0; i < data.length; i++) { - data[i] = i / 4; - } - final TableDefinition tableDefinition = TableDefinition.of(ColumnDefinition.ofInt("vvv").withGrouping()); - final Table tableToSave = newTable(tableDefinition, TableTools.col("vvv", data)); - - final String destFilename = "groupingColumnsWriteTests.parquet"; - final File destFile = new File(parentDir, destFilename); - writer.writeTable(tableToSave, destFile); - String vvvIndexFilePath = ".dh_metadata/indexes/vvv/index_vvv_groupingColumnsWriteTests.parquet"; - - // Write a new table successfully at the same position with different grouping columns - final TableDefinition anotherTableDefinition = TableDefinition.of(ColumnDefinition.ofInt("xxx").withGrouping()); - Table anotherTableToSave = newTable(anotherTableDefinition, TableTools.col("xxx", data)); - writer.writeTable(anotherTableToSave, destFile); - final String xxxIndexFilePath = 
".dh_metadata/indexes/xxx/index_xxx_groupingColumnsWriteTests.parquet"; - - // The directory now should contain the updated table, its grouping file for column xxx, and old grouping file - // for column vvv - verifyFilesInDir(parentDir, new String[] {destFilename}, - Map.of("vvv", new String[] {vvvIndexFilePath}, - "xxx", new String[] {xxxIndexFilePath})); - - checkSingleTable(anotherTableToSave, destFile); - - ParquetTableLocationKey tableLocationKey = - new ParquetTableLocationKey(destFile, 0, null, ParquetInstructions.EMPTY); - String metadataString = tableLocationKey.getMetadata().getFileMetaData().toString(); - assertTrue(metadataString.contains(xxxIndexFilePath) && !metadataString.contains(vvvIndexFilePath)); - - // Overwrite the table - writer.writeTable(anotherTableToSave, destFile); - - // The directory should still contain the updated table, its grouping file for column xxx, and old grouping file - // for column vvv - final File xxxIndexFile = new File(parentDir, xxxIndexFilePath); - final File backupXXXIndexFile = ParquetTools.getBackupFile(xxxIndexFile); - final String backupXXXIndexFileName = backupXXXIndexFile.getName(); - verifyFilesInDir(parentDir, new String[] {destFilename}, - Map.of("vvv", new String[] {vvvIndexFilePath}, - "xxx", new String[] {xxxIndexFilePath})); - - tableLocationKey = new ParquetTableLocationKey(destFile, 0, null, ParquetInstructions.EMPTY); - metadataString = tableLocationKey.getMetadata().getFileMetaData().toString(); - assertTrue(metadataString.contains(xxxIndexFilePath) && !metadataString.contains(vvvIndexFilePath) - && !metadataString.contains(backupXXXIndexFileName)); - FileUtils.deleteRecursively(parentDir); - } - - @Test - public void readChangedUnderlyingFileTests() { - readChangedUnderlyingFileTestsImpl(SINGLE_WRITER); - readChangedUnderlyingFileTestsImpl(MULTI_WRITER); - } - - public void readChangedUnderlyingFileTestsImpl(TestParquetTableWriter writer) { - // Write a table to parquet file and read it back - final Table tableToSave = TableTools.emptyTable(5).update("A=(int)i", "B=(long)i", "C=(double)i"); - final String filename = "readChangedUnderlyingFileTests.parquet"; - final File destFile = new File(rootFile, filename); - writer.writeTable(tableToSave, destFile); - Table fromDisk = readSingleFileTable(destFile, EMPTY); - // At this point, fromDisk is not fully materialized in the memory and would be read from the file on demand - - // Change the underlying file - final Table stringTable = TableTools.emptyTable(5).update("InputString = Long.toString(ii)"); - writer.writeTable(stringTable, destFile); - Table stringFromDisk = readSingleFileTable(destFile, EMPTY).select(); - assertTableEquals(stringTable, stringFromDisk); - - // Close all the file handles so that next time when fromDisk is accessed, we need to reopen the file handle - TrackedFileHandleFactory.getInstance().closeAll(); - - // Read back fromDisk. Since the underlying file has changed, we expect this to fail. 
- try { - fromDisk.coalesce(); - TestCase.fail("Expected TableDataException"); - } catch (TableDataException ignored) { - // expected - } - } - - @Test - public void readModifyWriteTests() { - readModifyWriteTestsImpl(SINGLE_WRITER); - readModifyWriteTestsImpl(MULTI_WRITER); - } - - public void readModifyWriteTestsImpl(TestParquetTableWriter writer) { - // Write a table to parquet file and read it back - final Table tableToSave = TableTools.emptyTable(5).update("A=(int)i", "B=(long)i", "C=(double)i"); - final String filename = "readModifyWriteTests.parquet"; - final File destFile = new File(rootFile, filename); - writer.writeTable(tableToSave, destFile); - Table fromDisk = readSingleFileTable(destFile, EMPTY); - // At this point, fromDisk is not fully materialized in the memory and would be read from the file on demand - - // Create a view table on fromDisk which should fail on writing, and try to write at the same location - // Since we are doing a view() operation and adding a new column and overwriting an existing column, the table - // won't be materialized in memory or cache. - final Table badTable = - fromDisk.view("InputString = ii % 2 == 0 ? Long.toString(ii) : null", "A=InputString.charAt(0)"); - try { - writer.writeTable(badTable, destFile); - TestCase.fail(); - } catch (UncheckedDeephavenException e) { - assertTrue(e.getCause() instanceof FormulaEvaluationException); - } - - // Close all old file handles so that we read the file path fresh instead of using any old handles - TrackedFileHandleFactory.getInstance().closeAll(); - - // Read back fromDisk and compare it with original table. If the underlying file has not been corrupted or - // swapped out, then we would not be able to read from the file - assertTableEquals(tableToSave, fromDisk); - } - - @Test - public void dictionaryEncodingTest() { - Collection columns = new ArrayList<>(Arrays.asList( - "shortStringColumn = `Row ` + i", - "longStringColumn = `This is row ` + i", - "someIntColumn = i")); - final int numRows = 10; - final ParquetInstructions writeInstructions = new ParquetInstructions.Builder() - .setMaximumDictionarySize(100) // Force "longStringColumn" to use non-dictionary encoding - .build(); - final Table stringTable = TableTools.emptyTable(numRows).select(Selectable.from(columns)); - final File dest = new File(rootFile + File.separator + "dictEncoding.parquet"); - writeTable(stringTable, dest, writeInstructions); - checkSingleTable(stringTable, dest); - - // Verify that string columns are properly dictionary encoded - final ParquetMetadata metadata = - new ParquetTableLocationKey(dest, 0, null, ParquetInstructions.EMPTY).getMetadata(); - final String firstColumnMetadata = metadata.getBlocks().get(0).getColumns().get(0).toString(); - assertTrue(firstColumnMetadata.contains("shortStringColumn") && firstColumnMetadata.contains("RLE_DICTIONARY")); - final String secondColumnMetadata = metadata.getBlocks().get(0).getColumns().get(1).toString(); - assertTrue( - secondColumnMetadata.contains("longStringColumn") && !secondColumnMetadata.contains("RLE_DICTIONARY")); - final String thirdColumnMetadata = metadata.getBlocks().get(0).getColumns().get(2).toString(); - assertTrue(thirdColumnMetadata.contains("someIntColumn") && !thirdColumnMetadata.contains("RLE_DICTIONARY")); - } - - @Test - public void overflowingStringsTest() { - // Test the behavior of writing parquet files if entries exceed the page size limit - final int pageSize = ParquetInstructions.MIN_TARGET_PAGE_SIZE; - final char[] data = new char[pageSize / 4]; - 
String someString = new String(data); - Collection columns = new ArrayList<>(Arrays.asList( - "someStringColumn = `" + someString + "` + i%10")); - final long numRows = 10; - ColumnChunkMetaData columnMetadata = overflowingStringsTestHelper(columns, numRows, pageSize); - String metadataStr = columnMetadata.toString(); - assertTrue(metadataStr.contains("someStringColumn") && metadataStr.contains("PLAIN") - && !metadataStr.contains("RLE_DICTIONARY")); - - // We exceed page size on hitting 4 rows, and we have 10 total rows. - // Therefore, we should have total 4 pages containing 3, 3, 3, 1 rows respectively. - assertEquals(columnMetadata.getEncodingStats().getNumDataPagesEncodedAs(Encoding.PLAIN), 4); - - final char[] veryLongData = new char[pageSize]; - someString = new String(veryLongData); - columns = new ArrayList<>( - Arrays.asList("someStringColumn = ii % 2 == 0 ? Long.toString(ii) : `" + someString + "` + ii")); - columnMetadata = overflowingStringsTestHelper(columns, numRows, pageSize); - // We will have 10 pages each containing 1 row. - assertEquals(columnMetadata.getEncodingStats().getNumDataPagesEncodedAs(Encoding.PLAIN), 10); - - // Table with rows of null alternating with strings exceeding the page size - columns = new ArrayList<>(Arrays.asList("someStringColumn = ii % 2 == 0 ? null : `" + someString + "` + ii")); - columnMetadata = overflowingStringsTestHelper(columns, numRows, pageSize); - // We will have 6 pages containing 1, 2, 2, 2, 2, 1 rows. - assertEquals(columnMetadata.getEncodingStats().getNumDataPagesEncodedAs(Encoding.PLAIN), 6); - } - - private static ColumnChunkMetaData overflowingStringsTestHelper(final Collection columns, - final long numRows, final int pageSize) { - final ParquetInstructions writeInstructions = new ParquetInstructions.Builder() - .setTargetPageSize(pageSize) // Force a small page size to cause splitting across pages - .setMaximumDictionarySize(50) // Force "someStringColumn" to use non-dictionary encoding - .build(); - Table stringTable = TableTools.emptyTable(numRows).select(Selectable.from(columns)); - final File dest = new File(rootFile + File.separator + "overflowingStringsTest.parquet"); - writeTable(stringTable, dest, writeInstructions); - checkSingleTable(stringTable, dest); - - ParquetMetadata metadata = new ParquetTableLocationKey(dest, 0, null, ParquetInstructions.EMPTY).getMetadata(); - ColumnChunkMetaData columnMetadata = metadata.getBlocks().get(0).getColumns().get(0); - return columnMetadata; - } - - @Test - public void overflowingCodecsTest() { - final int pageSize = ParquetInstructions.MIN_TARGET_PAGE_SIZE; - final ParquetInstructions writeInstructions = new ParquetInstructions.Builder() - .setTargetPageSize(pageSize) // Force a small page size to cause splitting across pages - .addColumnCodec("VariableWidthByteArrayColumn", SimpleByteArrayCodec.class.getName()) - .build(); - - final ColumnDefinition columnDefinition = - ColumnDefinition.fromGenericType("VariableWidthByteArrayColumn", byte[].class, byte.class); - final TableDefinition tableDefinition = TableDefinition.of(columnDefinition); - final byte[] byteArray = new byte[pageSize / 2]; - final Table table = newTable(tableDefinition, - TableTools.col("VariableWidthByteArrayColumn", byteArray, byteArray, byteArray)); - - final File dest = new File(rootFile + File.separator + "overflowingCodecsTest.parquet"); - writeTable(table, dest, writeInstructions); - checkSingleTable(table, dest); - - final ParquetMetadata metadata = - new ParquetTableLocationKey(dest, 0, null, 
ParquetInstructions.EMPTY).getMetadata(); - final String metadataStr = metadata.getFileMetaData().getKeyValueMetaData().get("deephaven"); - assertTrue( - metadataStr.contains("VariableWidthByteArrayColumn") && metadataStr.contains("SimpleByteArrayCodec")); - final ColumnChunkMetaData columnMetadata = metadata.getBlocks().get(0).getColumns().get(0); - final String columnMetadataStr = columnMetadata.toString(); - assertTrue(columnMetadataStr.contains("VariableWidthByteArrayColumn") && columnMetadataStr.contains("PLAIN")); - // Each byte array is of half the page size. So we exceed page size on hitting 3 byteArrays. - // Therefore, we should have total 2 pages containing 2, 1 rows respectively. - assertEquals(columnMetadata.getEncodingStats().getNumDataPagesEncodedAs(Encoding.PLAIN), 2); - } - - @Test - public void readWriteStatisticsTest() { - // Test simple structured table. - final ColumnDefinition columnDefinition = - ColumnDefinition.fromGenericType("VariableWidthByteArrayColumn", byte[].class, byte.class); - final TableDefinition tableDefinition = TableDefinition.of(columnDefinition); - final byte[] byteArray = new byte[] {1, 2, 3, 4, NULL_BYTE, 6, 7, 8, 9, NULL_BYTE, 11, 12, 13}; - final Table simpleTable = newTable(tableDefinition, - TableTools.col("VariableWidthByteArrayColumn", null, byteArray, byteArray, byteArray, byteArray, - byteArray)); - final File simpleTableDest = new File(rootFile, "ParquetTest_simple_statistics_test.parquet"); - writeTable(simpleTable, simpleTableDest); - - checkSingleTable(simpleTable, simpleTableDest); - - assertTableStatistics(simpleTable, simpleTableDest); - - // Test flat columns. - final Table flatTableToSave = getTableFlat(10_000, true, true); - final File flatTableDest = new File(rootFile, "ParquetTest_flat_statistics_test.parquet"); - writeTable(flatTableToSave, flatTableDest); - - checkSingleTable(maybeFixBigDecimal(flatTableToSave), flatTableDest); - - assertTableStatistics(flatTableToSave, flatTableDest); - - // Test nested columns. 
- final Table groupedTableToSave = getGroupedTable(10_000, true); - final File groupedTableDest = new File(rootFile, "ParquetTest_grouped_statistics_test.parquet"); - writeTable(groupedTableToSave, groupedTableDest, groupedTableToSave.getDefinition()); - - checkSingleTable(groupedTableToSave, groupedTableDest); - - assertTableStatistics(groupedTableToSave, groupedTableDest); - } - - @Test - public void readWriteDateTimeTest() { - final int NUM_ROWS = 1000; - final Table table = TableTools.emptyTable(NUM_ROWS).view( - "someDateColumn = java.time.LocalDate.ofEpochDay(i)", - "someTimeColumn = java.time.LocalTime.of(i%24, i%60, (i+10)%60)", - "someLocalDateTimeColumn = java.time.LocalDateTime.of(2000+i%10, i%12+1, i%30+1, (i+4)%24, (i+5)%60, (i+6)%60, i)", - "someInstantColumn = DateTimeUtils.now() + i").select(); - final File dest = new File(rootFile, "readWriteDateTimeTest.parquet"); - writeReadTableTest(table, dest); - - // Verify that the types are correct in the schema - final ParquetMetadata metadata = - new ParquetTableLocationKey(dest, 0, null, ParquetInstructions.EMPTY).getMetadata(); - final ColumnChunkMetaData dateColMetadata = metadata.getBlocks().get(0).getColumns().get(0); - assertTrue(dateColMetadata.toString().contains("someDateColumn")); - assertEquals(PrimitiveType.PrimitiveTypeName.INT32, dateColMetadata.getPrimitiveType().getPrimitiveTypeName()); - assertEquals(LogicalTypeAnnotation.dateType(), dateColMetadata.getPrimitiveType().getLogicalTypeAnnotation()); - - final ColumnChunkMetaData timeColMetadata = metadata.getBlocks().get(0).getColumns().get(1); - assertTrue(timeColMetadata.toString().contains("someTimeColumn")); - assertEquals(PrimitiveType.PrimitiveTypeName.INT64, timeColMetadata.getPrimitiveType().getPrimitiveTypeName()); - assertEquals(LogicalTypeAnnotation.timeType(true, LogicalTypeAnnotation.TimeUnit.NANOS), - timeColMetadata.getPrimitiveType().getLogicalTypeAnnotation()); - - final ColumnChunkMetaData localDateTimeColMetadata = metadata.getBlocks().get(0).getColumns().get(2); - assertTrue(localDateTimeColMetadata.toString().contains("someLocalDateTimeColumn")); - assertEquals(PrimitiveType.PrimitiveTypeName.INT64, - localDateTimeColMetadata.getPrimitiveType().getPrimitiveTypeName()); - assertEquals(LogicalTypeAnnotation.timestampType(false, LogicalTypeAnnotation.TimeUnit.NANOS), - localDateTimeColMetadata.getPrimitiveType().getLogicalTypeAnnotation()); - - final ColumnChunkMetaData instantColMetadata = metadata.getBlocks().get(0).getColumns().get(3); - assertTrue(instantColMetadata.toString().contains("someInstantColumn")); - assertEquals(PrimitiveType.PrimitiveTypeName.INT64, - instantColMetadata.getPrimitiveType().getPrimitiveTypeName()); - assertEquals(LogicalTypeAnnotation.timestampType(true, LogicalTypeAnnotation.TimeUnit.NANOS), - instantColMetadata.getPrimitiveType().getLogicalTypeAnnotation()); - } - - /** - * Test our manual verification techniques against a file generated by pyarrow. Here is the code to produce the file - * when/if this file needs to be re-generated or changed. - * - *

-     * ###############################################################################
-     * import pyarrow.parquet
-     *
-     * pa_table = pyarrow.table({
-     *     'int': [0, None, 100, -100],
-     *     'float': [0.0, None, 100.0, -100.0],
-     *     'string': ["aaa", None, "111", "ZZZ"],
-     *     'intList': [
-     *         [0, None, 2],
-     *         None,
-     *         [3, 4, 6, 7, 8, 9, 10, 100, -100],
-     *         [5]
-     *     ],
-     *     'floatList': [
-     *         [0.0, None, 2.0],
-     *         None,
-     *         [3.0, 4.0, 6.0, 7.0, 8.0, 9.0, 10.0, 100.0, -100.0],
-     *         [5.0]
-     *     ],
-     *     'stringList': [
-     *         ["aaa", None, None],
-     *         None,
-     *         ["111", "zzz", "ZZZ", "AAA"],
-     *         ["ccc"]
-     *     ]})
-     * pyarrow.parquet.write_table(pa_table, './extensions/parquet/table/src/test/resources/e0/pyarrow_stats.parquet')
-     * ###############################################################################
-     * 
- */ - @Test - public void verifyPyArrowStatistics() { - final String path = ParquetTableReadWriteTest.class.getResource("/e0/pyarrow_stats.parquet").getFile(); - final File pyarrowDest = new File(path); - final Table pyarrowFromDisk = readParquetFileFromGitLFS(pyarrowDest); - - // Verify that our verification code works for a pyarrow generated table. - assertTableStatistics(pyarrowFromDisk, pyarrowDest); - - // Write the table to disk using our code. - final File dhDest = new File(rootFile, "ParquetTest_statistics_test.parquet"); - writeTable(pyarrowFromDisk, dhDest); - - final Table dhFromDisk = checkSingleTable(pyarrowFromDisk, dhDest); - - // Run the verification code against DHC writer stats. - assertTableStatistics(pyarrowFromDisk, dhDest); - assertTableStatistics(dhFromDisk, dhDest); - } - - @Test - public void singleTable() { - final File fooSource = new File(rootFile, "singleTable/foo.parquet"); - final File fooBarSource = new File(rootFile, "singleTable/fooBar.parquet"); - final File barSource = new File(rootFile, "singleTable/bar.parquet"); - - final Table foo; - final Table fooBar; - final Table bar; - final Table fooBarNullFoo; - final Table fooBarNullBar; - - final TableDefinition fooDefinition; - final TableDefinition fooBarDefinition; - final TableDefinition barDefinition; - { - fooSource.mkdirs(); - fooBarSource.mkdirs(); - barSource.mkdirs(); - - final ColumnHolder fooCol = intCol("Foo", 1, 2, 3); - final ColumnHolder barCol = stringCol("Bar", "Zip", "Zap", "Zoom"); - - final ColumnHolder nullFooCol = - intCol("Foo", QueryConstants.NULL_INT, QueryConstants.NULL_INT, QueryConstants.NULL_INT); - final ColumnHolder nullBarCol = stringCol("Bar", null, null, null); - - final ColumnDefinition fooColDef = ColumnDefinition.ofInt("Foo"); - final ColumnDefinition barColDef = ColumnDefinition.ofString("Bar"); - - fooDefinition = TableDefinition.of(fooColDef); - fooBarDefinition = TableDefinition.of(fooColDef, barColDef); - barDefinition = TableDefinition.of(barColDef); - - foo = newTable(fooDefinition, fooCol); - fooBar = newTable(fooBarDefinition, fooCol, barCol); - bar = newTable(barDefinition, barCol); - - fooBarNullFoo = newTable(fooBarDefinition, nullFooCol, barCol); - fooBarNullBar = newTable(fooBarDefinition, fooCol, nullBarCol); - - writeTable(foo, fooSource); - writeTable(fooBar, fooBarSource); - writeTable(bar, barSource); - } - - // Infer - { - checkSingleTable(foo, fooSource); - checkSingleTable(fooBar, fooBarSource); - checkSingleTable(bar, barSource); - } - - // readTable inference to readSingleTable - { - assertTableEquals(foo, readTable(fooSource)); - assertTableEquals(fooBar, readTable(fooBarSource)); - assertTableEquals(bar, readTable(barSource)); - } - - // Explicit - { - assertTableEquals(foo, readSingleFileTable(fooSource, EMPTY, fooDefinition)); - assertTableEquals(fooBar, readSingleFileTable(fooBarSource, EMPTY, fooBarDefinition)); - assertTableEquals(bar, readSingleFileTable(barSource, EMPTY, barDefinition)); - } - - // Explicit subset - { - // fooBar as foo - assertTableEquals(foo, readSingleFileTable(fooBarSource, EMPTY, fooDefinition)); - // fooBar as bar - assertTableEquals(bar, readSingleFileTable(fooBarSource, EMPTY, barDefinition)); - } - - // Explicit superset - { - // foo as fooBar - assertTableEquals(fooBarNullBar, readSingleFileTable(fooSource, EMPTY, fooBarDefinition)); - // bar as fooBar - assertTableEquals(fooBarNullFoo, readSingleFileTable(barSource, EMPTY, fooBarDefinition)); - } - - // No refreshing single table support - { - try { - 
readSingleFileTable(fooSource, REFRESHING); - fail("Expected IllegalArgumentException"); - } catch (IllegalArgumentException e) { - assertEquals("Unable to have a refreshing single parquet file", e.getMessage()); - } - - try { - readSingleFileTable(fooSource, REFRESHING, fooDefinition); - fail("Expected IllegalArgumentException"); - } catch (IllegalArgumentException e) { - assertEquals("Unable to have a refreshing single parquet file", e.getMessage()); - } - } - } - - @Test - public void flatPartitionedTable() { - // Create an empty parent directory - final File source = new File(rootFile, "flatPartitionedTable/source"); - final File emptySource = new File(rootFile, "flatPartitionedTable/emptySource"); - - final Table formerData; - final Table latterData; - final TableDefinition formerDefinition; - final TableDefinition latterDefinition; - final Runnable writeIntoEmptySource; - { - final File p1File = new File(source, "01.parquet"); - final File p2File = new File(source, "02.parquet"); - - final File p1FileEmpty = new File(emptySource, "01.parquet"); - final File p2FileEmpty = new File(emptySource, "02.parquet"); - - p1File.mkdirs(); - p2File.mkdirs(); - emptySource.mkdirs(); - - final ColumnHolder foo1 = intCol("Foo", 1, 2, 3); - final ColumnHolder foo2 = intCol("Foo", 4, 5); - - final ColumnHolder bar1 = stringCol("Bar", null, null, null); - final ColumnHolder bar2 = stringCol("Bar", "Zip", "Zap"); - - final Table p1 = newTable(foo1); - final Table p2 = newTable(foo2, bar2); - writeTable(p1, p1File); - writeTable(p2, p2File); - writeIntoEmptySource = () -> { - p1FileEmpty.mkdirs(); - p2FileEmpty.mkdirs(); - writeTable(p1, p1FileEmpty); - writeTable(p2, p2FileEmpty); - }; - - final ColumnDefinition foo = ColumnDefinition.ofInt("Foo"); - final ColumnDefinition bar = ColumnDefinition.ofString("Bar"); - - formerDefinition = TableDefinition.of(foo); - latterDefinition = TableDefinition.of(foo, bar); - - formerData = merge( - newTable(formerDefinition, foo1), - newTable(formerDefinition, foo2)); - latterData = merge( - newTable(latterDefinition, foo1, bar1), - newTable(latterDefinition, foo2, bar2)); - } - - // Infer from last key - { - final Table table = readFlatPartitionedTable(source, EMPTY); - assertTableEquals(latterData, table); - } - // Infer from last key, refreshing - { - final Table table = readFlatPartitionedTable(source, REFRESHING); - assertTableEquals(latterData, table); - } - // readTable inference to readFlatPartitionedTable - { - assertTableEquals(latterData, readTable(source)); - } - - // Explicit latter definition - { - final Table table = readFlatPartitionedTable(source, EMPTY, latterDefinition); - assertTableEquals(latterData, table); - } - // Explicit latter definition, refreshing - { - final Table table = readFlatPartitionedTable(source, REFRESHING, latterDefinition); - assertTableEquals(latterData, table); - } - - // Explicit former definition - { - final Table table = readFlatPartitionedTable(source, EMPTY, formerDefinition); - assertTableEquals(formerData, table); - } - // Explicit former definition, refreshing - { - final Table table = readFlatPartitionedTable(source, REFRESHING, formerDefinition); - assertTableEquals(formerData, table); - } - - // Explicit definition, empty directory - { - final Table table = readFlatPartitionedTable(emptySource, EMPTY, latterDefinition); - assertTableEquals(TableTools.newTable(latterDefinition), table); - } - // Explicit definition, empty directory, refreshing with new data added - { - final Table table = 
readFlatPartitionedTable(emptySource, REFRESHING, latterDefinition); - assertTableEquals(TableTools.newTable(latterDefinition), table); - - writeIntoEmptySource.run(); - ExecutionContext.getContext().getUpdateGraph().cast().runWithinUnitTestCycle(() -> { - // This is not generally a good way to do this sort of testing. Ideally, we'd be a bit smarter and use - // a test-driven TableDataRefreshService.getSharedRefreshService. - ((SourceTable) table).tableLocationProvider().refresh(); - ((SourceTable) table).refresh(); - assertTableEquals(latterData, table); - }); - } - } - - @Test - public void keyValuePartitionedTable() { - final File source = new File(rootFile, "keyValuePartitionedTable/source"); - final File emptySource = new File(rootFile, "keyValuePartitionedTable/emptySource"); - - final Table formerData; - final Table latterData; - final TableDefinition formerDefinition; - final TableDefinition latterDefinition; - final Runnable writeIntoEmptySource; - { - final File p1File = new File(source, "Partition=1/z.parquet"); - final File p2File = new File(source, "Partition=2/a.parquet"); - - final File p1FileEmpty = new File(emptySource, "Partition=1/z.parquet"); - final File p2FileEmpty = new File(emptySource, "Partition=2/a.parquet"); - - p1File.mkdirs(); - p2File.mkdirs(); - emptySource.mkdirs(); - - final ColumnHolder part1 = intCol("Partition", 1, 1, 1); - final ColumnHolder part2 = intCol("Partition", 2, 2); - - final ColumnHolder foo1 = intCol("Foo", 1, 2, 3); - final ColumnHolder foo2 = intCol("Foo", 4, 5); - - final ColumnHolder bar1 = stringCol("Bar", null, null, null); - final ColumnHolder bar2 = stringCol("Bar", "Zip", "Zap"); - - final Table p1 = newTable(foo1); - final Table p2 = newTable(foo2, bar2); - writeTable(p1, p1File); - writeTable(p2, p2File); - writeIntoEmptySource = () -> { - p1FileEmpty.mkdirs(); - p2FileEmpty.mkdirs(); - writeTable(p1, p1FileEmpty); - writeTable(p2, p2FileEmpty); - }; - - // Need to be explicit w/ definition so partitioning column applied to expected tables - final ColumnDefinition partition = ColumnDefinition.ofInt("Partition").withPartitioning(); - final ColumnDefinition foo = ColumnDefinition.ofInt("Foo"); - final ColumnDefinition bar = ColumnDefinition.ofString("Bar"); - - // Note: merge does not preserve partition column designation, so we need to explicitly create them - formerDefinition = TableDefinition.of(partition, foo); - latterDefinition = TableDefinition.of(partition, foo, bar); - - formerData = merge( - newTable(formerDefinition, part1, foo1), - newTable(formerDefinition, part2, foo2)); - latterData = merge( - newTable(latterDefinition, part1, foo1, bar1), - newTable(latterDefinition, part2, foo2, bar2)); - } - - // Infer from last key - { - final Table table = readKeyValuePartitionedTable(source, EMPTY); - assertTableEquals(latterData, table); - } - // Infer from last key, refreshing - { - final Table table = readKeyValuePartitionedTable(source, REFRESHING); - assertTableEquals(latterData, table); - } - // readTable inference readKeyValuePartitionedTable - { - assertTableEquals(latterData, readTable(source)); - } - - // Explicit latter definition - { - final Table table = readKeyValuePartitionedTable(source, EMPTY, latterDefinition); - assertTableEquals(latterData, table); - } - // Explicit latter definition, refreshing - { - final Table table = readKeyValuePartitionedTable(source, REFRESHING, latterDefinition); - assertTableEquals(latterData, table); - } - - // Explicit former definition - { - final Table table = 
readKeyValuePartitionedTable(source, EMPTY, formerDefinition); - assertTableEquals(formerData, table); - } - // Explicit former definition, refreshing - { - final Table table = readKeyValuePartitionedTable(source, REFRESHING, formerDefinition); - assertTableEquals(formerData, table); - } - - // Explicit definition, empty directory - { - final Table table = readKeyValuePartitionedTable(emptySource, EMPTY, latterDefinition); - assertTableEquals(TableTools.newTable(latterDefinition), table); - } - // Explicit definition, empty directory, refreshing with new data added - { - final Table table = readKeyValuePartitionedTable(emptySource, REFRESHING, latterDefinition); - assertTableEquals(TableTools.newTable(latterDefinition), table); - - writeIntoEmptySource.run(); - ExecutionContext.getContext().getUpdateGraph().cast().runWithinUnitTestCycle(() -> { - // This is not generally a good way to do this sort of testing. Ideally, we'd be a bit smarter and use - // a test-driven TableDataRefreshService.getSharedRefreshService. - ((SourceTable) table).tableLocationProvider().refresh(); - ((SourceTable) table).refresh(); - assertTableEquals(latterData, table); - }); - } - } - - @Test - public void readSingleColumn() { - final File file = new File(rootFile, "readSingleColumn.parquet"); - final Table primitives = newTable( - booleanCol("Bool", null, true), - charCol("Char", NULL_CHAR, (char) 42), - byteCol("Byte", NULL_BYTE, (byte) 42), - shortCol("Short", NULL_SHORT, (short) 42), - intCol("Int", NULL_INT, 42), - longCol("Long", NULL_LONG, 42L), - floatCol("Float", NULL_FLOAT, 42.0f), - doubleCol("Double", NULL_DOUBLE, 42.0), - stringCol("String", null, "42"), - instantCol("Instant", null, Instant.ofEpochMilli(42))); - { - writeTable(primitives, file); - } - assertTableEquals( - primitives.view("Bool"), - readSingleFileTable(file, EMPTY, TableDefinition.of(ColumnDefinition.ofBoolean("Bool")))); - assertTableEquals( - primitives.view("Char"), - readSingleFileTable(file, EMPTY, TableDefinition.of(ColumnDefinition.ofChar("Char")))); - assertTableEquals( - primitives.view("Byte"), - readSingleFileTable(file, EMPTY, TableDefinition.of(ColumnDefinition.ofByte("Byte")))); - assertTableEquals( - primitives.view("Short"), - readSingleFileTable(file, EMPTY, TableDefinition.of(ColumnDefinition.ofShort("Short")))); - assertTableEquals( - primitives.view("Int"), - readSingleFileTable(file, EMPTY, TableDefinition.of(ColumnDefinition.ofInt("Int")))); - assertTableEquals( - primitives.view("Long"), - readSingleFileTable(file, EMPTY, TableDefinition.of(ColumnDefinition.ofLong("Long")))); - assertTableEquals( - primitives.view("Float"), - readSingleFileTable(file, EMPTY, TableDefinition.of(ColumnDefinition.ofFloat("Float")))); - assertTableEquals( - primitives.view("Double"), - readSingleFileTable(file, EMPTY, TableDefinition.of(ColumnDefinition.ofDouble("Double")))); - assertTableEquals( - primitives.view("String"), - readSingleFileTable(file, EMPTY, TableDefinition.of(ColumnDefinition.ofString("String")))); - assertTableEquals( - primitives.view("Instant"), - readSingleFileTable(file, EMPTY, TableDefinition.of(ColumnDefinition.ofTime("Instant")))); - } - - private void assertTableStatistics(Table inputTable, File dest) { - // Verify that the columns have the correct statistics. 
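// Condensed sketch of the per-column statistics check performed below, assuming a URI
// `parquetUri` for a file that has already been written; `parquetUri` is an illustrative name,
// and ParquetMetadata, ColumnChunkMetaData and Statistics are the org.apache.parquet metadata
// classes already used throughout this test.
final ParquetMetadata exampleMetadata =
        new ParquetTableLocationKey(parquetUri, 0, null, ParquetInstructions.EMPTY).getMetadata();
final ColumnChunkMetaData firstColumnChunk = exampleMetadata.getBlocks().get(0).getColumns().get(0);
final Statistics<?> exampleStats = firstColumnChunk.getStatistics();
final long nullCount = exampleStats.getNumNulls(); // null count is recorded even for all-null columns
if (exampleStats.hasNonNullValue()) {
    // min/max are only meaningful when at least one non-null value was written
    final Object observedMin = exampleStats.genericGetMin();
    final Object observedMax = exampleStats.genericGetMax();
}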
- final ParquetMetadata metadata = - new ParquetTableLocationKey(dest, 0, null, ParquetInstructions.EMPTY).getMetadata(); - - final String[] colNames = inputTable.getDefinition().getColumnNamesArray(); - for (int colIdx = 0; colIdx < inputTable.numColumns(); ++colIdx) { - final String colName = colNames[colIdx]; - - final ColumnSource columnSource = inputTable.getColumnSource(colName); - final ColumnChunkMetaData columnChunkMetaData = metadata.getBlocks().get(0).getColumns().get(colIdx); - final Statistics statistics = columnChunkMetaData.getStatistics(); - - final Class csType = columnSource.getType(); - - if (csType == boolean.class || csType == Boolean.class) { - assertBooleanColumnStatistics( - new SerialByteColumnIterator( - ReinterpretUtils.booleanToByteSource((ColumnSource) columnSource), - inputTable.getRowSet()), - (Statistics) statistics); - } else if (csType == Boolean[].class) { - assertBooleanArrayColumnStatistics( - new SerialObjectColumnIterator<>( - (ColumnSource) columnSource, - inputTable.getRowSet()), - (Statistics) statistics); - } else if (csType == byte.class || csType == Byte.class) { - assertByteColumnStatistics( - new SerialByteColumnIterator( - (ColumnSource) columnSource, inputTable.getRowSet()), - (Statistics) statistics); - } else if (csType == byte[].class) { - assertByteArrayColumnStatistics( - new SerialObjectColumnIterator<>( - (ColumnSource) columnSource, - inputTable.getRowSet()), - (Statistics) statistics); - } else if (csType == ByteVector.class) { - assertByteVectorColumnStatistics( - new SerialObjectColumnIterator<>( - (ColumnSource) columnSource, - inputTable.getRowSet()), - (Statistics) statistics); - } else if (csType == char.class || csType == Character.class) { - assertCharColumnStatistics( - new SerialCharacterColumnIterator( - (ColumnSource) columnSource, inputTable.getRowSet()), - (Statistics) statistics); - } else if (csType == char[].class) { - assertCharArrayColumnStatistics( - new SerialObjectColumnIterator<>( - (ColumnSource) columnSource, - inputTable.getRowSet()), - (Statistics) statistics); - } else if (csType == CharVector.class) { - assertCharVectorColumnStatistics( - new SerialObjectColumnIterator<>( - (ColumnSource) columnSource, - inputTable.getRowSet()), - (Statistics) statistics); - } else if (csType == short.class || csType == Short.class) { - assertShortColumnStatistics( - new SerialShortColumnIterator( - (ColumnSource) columnSource, inputTable.getRowSet()), - (Statistics) statistics); - } else if (csType == short[].class) { - assertShortArrayColumnStatistics( - new SerialObjectColumnIterator<>( - (ColumnSource) columnSource, - inputTable.getRowSet()), - (Statistics) statistics); - } else if (csType == ShortVector.class) { - assertShortVectorColumnStatistics( - new SerialObjectColumnIterator<>( - (ColumnSource) columnSource, - inputTable.getRowSet()), - (Statistics) statistics); - } else if (csType == int.class || csType == Integer.class) { - assertIntColumnStatistics( - new SerialIntegerColumnIterator( - (ColumnSource) columnSource, inputTable.getRowSet()), - (Statistics) statistics); - } else if (csType == int[].class) { - assertIntArrayColumnStatistics( - new SerialObjectColumnIterator<>( - (ColumnSource) columnSource, - inputTable.getRowSet()), - (Statistics) statistics); - } else if (csType == IntVector.class) { - assertIntVectorColumnStatistics( - new SerialObjectColumnIterator<>( - (ColumnSource) columnSource, - inputTable.getRowSet()), - (Statistics) statistics); - } else if (csType == long.class || csType == 
Long.class) { - assertLongColumnStatistics( - new SerialLongColumnIterator( - (ColumnSource) columnSource, inputTable.getRowSet()), - (Statistics) statistics); - } else if (csType == long[].class) { - assertLongArrayColumnStatistics( - new SerialObjectColumnIterator<>( - (ColumnSource) columnSource, - inputTable.getRowSet()), - (Statistics) statistics); - } else if (csType == LongVector.class) { - assertLongVectorColumnStatistics( - new SerialObjectColumnIterator<>( - (ColumnSource) columnSource, - inputTable.getRowSet()), - (Statistics) statistics); - } else if (csType == float.class || csType == Float.class) { - assertFloatColumnStatistics( - new SerialFloatColumnIterator( - (ColumnSource) columnSource, inputTable.getRowSet()), - (Statistics) statistics); - } else if (csType == float[].class) { - assertFloatArrayColumnStatistics( - new SerialObjectColumnIterator<>( - (ColumnSource) columnSource, - inputTable.getRowSet()), - (Statistics) statistics); - } else if (csType == FloatVector.class) { - assertFloatVectorColumnStatistics( - new SerialObjectColumnIterator<>( - (ColumnSource) columnSource, - inputTable.getRowSet()), - (Statistics) statistics); - } else if (csType == double.class || csType == Double.class) { - assertDoubleColumnStatistics( - new SerialDoubleColumnIterator( - (ColumnSource) columnSource, inputTable.getRowSet()), - (Statistics) statistics); - } else if (csType == double[].class) { - assertDoubleArrayColumnStatistics( - new SerialObjectColumnIterator<>( - (ColumnSource) columnSource, - inputTable.getRowSet()), - (Statistics) statistics); - } else if (csType == DoubleVector.class) { - assertDoubleVectorColumnStatistics( - new SerialObjectColumnIterator<>( - (ColumnSource) columnSource, - inputTable.getRowSet()), - (Statistics) statistics); - } else if (csType == String.class) { - assertStringColumnStatistics( - new SerialObjectColumnIterator<>( - (ColumnSource) columnSource, inputTable.getRowSet()), - (Statistics) statistics); - } else if (csType == String[].class) { - assertStringArrayColumnStatistics( - new SerialObjectColumnIterator<>( - (ColumnSource) columnSource, - inputTable.getRowSet()), - (Statistics) statistics); - } else if (csType == ObjectVector.class && columnSource.getComponentType() == String.class) { - assertStringVectorColumnStatistics( - new SerialObjectColumnIterator<>( - (ColumnSource>) columnSource, - inputTable.getRowSet()), - (Statistics) statistics); - } else if (csType == BigInteger.class) { - assertBigIntegerColumnStatistics( - new SerialObjectColumnIterator( - (ColumnSource) columnSource, inputTable.getRowSet()), - (Statistics) statistics); - } else if (csType == BigDecimal.class) { - assertBigDecimalColumnStatistics( - new SerialObjectColumnIterator<>( - (ColumnSource) columnSource, inputTable.getRowSet()), - (Statistics) statistics); - } else if (csType == Instant.class) { - assertInstantColumnStatistic( - new SerialObjectColumnIterator<>( - (ColumnSource) columnSource, inputTable.getRowSet()), - (Statistics) statistics); - } else if (csType == Instant[].class) { - assertInstantArrayColumnStatistics( - new SerialObjectColumnIterator<>( - (ColumnSource) columnSource, - inputTable.getRowSet()), - (Statistics) statistics); - } else if (csType == ObjectVector.class && columnSource.getComponentType() == Instant.class) { - assertInstantVectorColumnStatistics( - new SerialObjectColumnIterator<>( - (ColumnSource>) columnSource, - inputTable.getRowSet()), - (Statistics) statistics); - } else { - // We can't verify statistics for this column type, so 
just skip it. - System.out.println("Ignoring column " + colName + " of type " + csType.getName()); - } - } - } - - // region Column Statistics Assertions - private void assertBooleanColumnStatistics(SerialByteColumnIterator iterator, Statistics statistics) { - MutableLong itemCount = new MutableLong(0); - MutableLong nullCount = new MutableLong(0); - MutableInt min = new MutableInt(NULL_BYTE); - MutableInt max = new MutableInt(NULL_BYTE); - - iterator.forEachRemaining((ByteConsumer) value -> { - itemCount.increment(); - if (value == NULL_BYTE) { - nullCount.increment(); - } else { - if (min.getValue() == NULL_BYTE || value < min.getValue()) { - min.setValue(value); - } - if (max.getValue() == NULL_BYTE || value > max.getValue()) { - max.setValue(value); - } - } - }); - - assertEquals(nullCount.intValue(), statistics.getNumNulls()); - if (!itemCount.getValue().equals(nullCount.getValue())) { - // There are some non-null values, so min and max should be non-null and equal to observed values. - assertEquals(min.getValue() == 1, statistics.genericGetMin()); - assertEquals(max.getValue() == 1, statistics.genericGetMax()); - } else { - // Everything is null, statistics should be empty. - assertFalse(statistics.hasNonNullValue()); - } - } - - private void assertBooleanArrayColumnStatistics(SerialObjectColumnIterator iterator, - Statistics statistics) { - MutableLong itemCount = new MutableLong(0); - MutableLong nullCount = new MutableLong(0); - MutableInt min = new MutableInt(NULL_BYTE); - MutableInt max = new MutableInt(NULL_BYTE); - - iterator.forEachRemaining(values -> { - if (values == null) { - itemCount.increment(); - nullCount.increment(); - return; - } - for (final Boolean value : values) { - itemCount.increment(); - if (value == null) { - nullCount.increment(); - } else { - if (min.getValue() == NULL_BYTE || (value ? 1 : 0) < min.getValue()) { - min.setValue(value ? 1 : 0); - } - if (max.getValue() == NULL_BYTE || (value ? 1 : 0) > max.getValue()) { - max.setValue(value ? 1 : 0); - } - } - } - }); - - assertEquals(nullCount.intValue(), statistics.getNumNulls()); - if (!itemCount.getValue().equals(nullCount.getValue())) { - // There are some non-null values, so min and max should be non-null and equal to observed values. - assertEquals(min.getValue() == 1, statistics.genericGetMin()); - assertEquals(max.getValue() == 1, statistics.genericGetMax()); - } else { - // Everything is null, statistics should be empty. - assertFalse(statistics.hasNonNullValue()); - } - } - - private void assertByteColumnStatistics(SerialByteColumnIterator iterator, Statistics statistics) { - MutableLong itemCount = new MutableLong(0); - MutableLong nullCount = new MutableLong(0); - MutableInt min = new MutableInt(NULL_BYTE); - MutableInt max = new MutableInt(NULL_BYTE); - - iterator.forEachRemaining((ByteConsumer) value -> { - itemCount.increment(); - if (value == NULL_BYTE) { - nullCount.increment(); - } else { - if (min.getValue() == NULL_BYTE || value < min.getValue()) { - min.setValue(value); - } - if (max.getValue() == NULL_BYTE || value > max.getValue()) { - max.setValue(value); - } - } - }); - - assertEquals(nullCount.intValue(), statistics.getNumNulls()); - if (!itemCount.getValue().equals(nullCount.getValue())) { - // There are some non-null values, so min and max should be non-null and equal to observed values. - assertEquals(min.getValue(), statistics.genericGetMin()); - assertEquals(max.getValue(), statistics.genericGetMax()); - } else { - // Everything is null, statistics should be empty. 
- assertFalse(statistics.hasNonNullValue()); - } - } - - private void assertByteArrayColumnStatistics(SerialObjectColumnIterator iterator, - Statistics statistics) { - MutableLong itemCount = new MutableLong(0); - MutableLong nullCount = new MutableLong(0); - MutableInt min = new MutableInt(NULL_BYTE); - MutableInt max = new MutableInt(NULL_BYTE); - - iterator.forEachRemaining(values -> { - if (values == null) { - itemCount.increment(); - nullCount.increment(); - return; - } - for (final byte value : values) { - itemCount.increment(); - if (value == NULL_BYTE) { - nullCount.increment(); - } else { - if (min.getValue() == NULL_BYTE || value < min.getValue()) { - min.setValue(value); - } - if (max.getValue() == NULL_BYTE || value > max.getValue()) { - max.setValue(value); - } - } - } - }); - - assertEquals(nullCount.intValue(), statistics.getNumNulls()); - if (!itemCount.getValue().equals(nullCount.getValue())) { - // There are some non-null values, so min and max should be non-null and equal to observed values. - assertEquals(min.getValue(), statistics.genericGetMin()); - assertEquals(max.getValue(), statistics.genericGetMax()); - } else { - // Everything is null, statistics should be empty. - assertFalse(statistics.hasNonNullValue()); - } - } - - private void assertByteVectorColumnStatistics(SerialObjectColumnIterator iterator, - Statistics statistics) { - MutableLong itemCount = new MutableLong(0); - MutableLong nullCount = new MutableLong(0); - MutableInt min = new MutableInt(NULL_BYTE); - MutableInt max = new MutableInt(NULL_BYTE); - - iterator.forEachRemaining(values -> { - if (values == null) { - itemCount.increment(); - nullCount.increment(); - return; - } - for (final byte value : values) { - itemCount.increment(); - if (value == NULL_BYTE) { - nullCount.increment(); - } else { - if (min.getValue() == NULL_BYTE || value < min.getValue()) { - min.setValue(value); - } - if (max.getValue() == NULL_BYTE || value > max.getValue()) { - max.setValue(value); - } - } - } - }); - - assertEquals(nullCount.intValue(), statistics.getNumNulls()); - if (!itemCount.getValue().equals(nullCount.getValue())) { - // There are some non-null values, so min and max should be non-null and equal to observed values. - assertEquals(min.getValue(), statistics.genericGetMin()); - assertEquals(max.getValue(), statistics.genericGetMax()); - } else { - // Everything is null, statistics should be empty. - assertFalse(statistics.hasNonNullValue()); - } - } - - private void assertCharColumnStatistics(SerialCharacterColumnIterator iterator, Statistics statistics) { - MutableLong itemCount = new MutableLong(0); - MutableLong nullCount = new MutableLong(0); - MutableInt min = new MutableInt(NULL_CHAR); - MutableInt max = new MutableInt(NULL_CHAR); - - iterator.forEachRemaining((CharConsumer) value -> { - itemCount.increment(); - if (value == NULL_CHAR) { - nullCount.increment(); - } else { - if (min.getValue() == NULL_CHAR || value < min.getValue()) { - min.setValue(value); - } - if (max.getValue() == NULL_CHAR || value > max.getValue()) { - max.setValue(value); - } - } - }); - - assertEquals(nullCount.intValue(), statistics.getNumNulls()); - if (!itemCount.getValue().equals(nullCount.getValue())) { - // There are some non-null values, so min and max should be non-null and equal to observed values. - assertEquals(min.getValue(), statistics.genericGetMin()); - assertEquals(max.getValue(), statistics.genericGetMax()); - } else { - // Everything is null, statistics should be empty. 
- assertFalse(statistics.hasNonNullValue()); - } - } - - private void assertCharArrayColumnStatistics(SerialObjectColumnIterator iterator, - Statistics statistics) { - MutableLong itemCount = new MutableLong(0); - MutableLong nullCount = new MutableLong(0); - MutableInt min = new MutableInt(NULL_CHAR); - MutableInt max = new MutableInt(NULL_CHAR); - - iterator.forEachRemaining(values -> { - if (values == null) { - itemCount.increment(); - nullCount.increment(); - return; - } - for (final char value : values) { - itemCount.increment(); - if (value == NULL_CHAR) { - nullCount.increment(); - } else { - if (min.getValue() == NULL_CHAR || value < min.getValue()) { - min.setValue(value); - } - if (max.getValue() == NULL_CHAR || value > max.getValue()) { - max.setValue(value); - } - } - } - }); - - assertEquals(nullCount.intValue(), statistics.getNumNulls()); - if (!itemCount.getValue().equals(nullCount.getValue())) { - // There are some non-null values, so min and max should be non-null and equal to observed values. - assertEquals(min.getValue(), statistics.genericGetMin()); - assertEquals(max.getValue(), statistics.genericGetMax()); - } else { - // Everything is null, statistics should be empty. - assertFalse(statistics.hasNonNullValue()); - } - } - - private void assertCharVectorColumnStatistics(SerialObjectColumnIterator iterator, - Statistics statistics) { - MutableLong itemCount = new MutableLong(0); - MutableLong nullCount = new MutableLong(0); - MutableInt min = new MutableInt(NULL_CHAR); - MutableInt max = new MutableInt(NULL_CHAR); - - iterator.forEachRemaining(values -> { - if (values == null) { - itemCount.increment(); - nullCount.increment(); - return; - } - for (final char value : values) { - itemCount.increment(); - if (value == NULL_CHAR) { - nullCount.increment(); - } else { - if (min.getValue() == NULL_CHAR || value < min.getValue()) { - min.setValue(value); - } - if (max.getValue() == NULL_CHAR || value > max.getValue()) { - max.setValue(value); - } - } - } - }); - - assertEquals(nullCount.intValue(), statistics.getNumNulls()); - if (!itemCount.getValue().equals(nullCount.getValue())) { - // There are some non-null values, so min and max should be non-null and equal to observed values. - assertEquals(min.getValue(), statistics.genericGetMin()); - assertEquals(max.getValue(), statistics.genericGetMax()); - } else { - // Everything is null, statistics should be empty. - assertFalse(statistics.hasNonNullValue()); - } - } - - private void assertShortColumnStatistics(SerialShortColumnIterator iterator, Statistics statistics) { - MutableLong itemCount = new MutableLong(0); - MutableLong nullCount = new MutableLong(0); - MutableInt min = new MutableInt(NULL_SHORT); - MutableInt max = new MutableInt(NULL_SHORT); - - iterator.forEachRemaining((ShortConsumer) value -> { - itemCount.increment(); - if (value == NULL_SHORT) { - nullCount.increment(); - } else { - if (min.getValue() == NULL_SHORT || value < min.getValue()) { - min.setValue(value); - } - if (max.getValue() == NULL_SHORT || value > max.getValue()) { - max.setValue(value); - } - } - }); - - assertEquals(nullCount.intValue(), statistics.getNumNulls()); - if (!itemCount.getValue().equals(nullCount.getValue())) { - // There are some non-null values, so min and max should be non-null and equal to observed values. - assertEquals(min.getValue(), statistics.genericGetMin()); - assertEquals(max.getValue(), statistics.genericGetMax()); - } else { - // Everything is null, statistics should be empty. 
- assertFalse(statistics.hasNonNullValue()); - } - } - - private void assertShortArrayColumnStatistics(SerialObjectColumnIterator iterator, - Statistics statistics) { - MutableLong itemCount = new MutableLong(0); - MutableLong nullCount = new MutableLong(0); - MutableInt min = new MutableInt(NULL_SHORT); - MutableInt max = new MutableInt(NULL_SHORT); - - iterator.forEachRemaining(values -> { - if (values == null) { - itemCount.increment(); - nullCount.increment(); - return; - } - for (final short value : values) { - itemCount.increment(); - if (value == NULL_SHORT) { - nullCount.increment(); - } else { - if (min.getValue() == NULL_SHORT || value < min.getValue()) { - min.setValue(value); - } - if (max.getValue() == NULL_SHORT || value > max.getValue()) { - max.setValue(value); - } - } - } - }); - - assertEquals(nullCount.intValue(), statistics.getNumNulls()); - if (!itemCount.getValue().equals(nullCount.getValue())) { - // There are some non-null values, so min and max should be non-null and equal to observed values. - assertEquals(min.getValue(), statistics.genericGetMin()); - assertEquals(max.getValue(), statistics.genericGetMax()); - } else { - // Everything is null, statistics should be empty. - assertFalse(statistics.hasNonNullValue()); - } - } - - private void assertShortVectorColumnStatistics(SerialObjectColumnIterator iterator, - Statistics statistics) { - MutableLong itemCount = new MutableLong(0); - MutableLong nullCount = new MutableLong(0); - MutableInt min = new MutableInt(NULL_SHORT); - MutableInt max = new MutableInt(NULL_SHORT); - - iterator.forEachRemaining(values -> { - if (values == null) { - itemCount.increment(); - nullCount.increment(); - return; - } - for (final short value : values) { - itemCount.increment(); - if (value == NULL_SHORT) { - nullCount.increment(); - } else { - if (min.getValue() == NULL_SHORT || value < min.getValue()) { - min.setValue(value); - } - if (max.getValue() == NULL_SHORT || value > max.getValue()) { - max.setValue(value); - } - } - } - }); - - assertEquals(nullCount.intValue(), statistics.getNumNulls()); - if (!itemCount.getValue().equals(nullCount.getValue())) { - // There are some non-null values, so min and max should be non-null and equal to observed values. - assertEquals(min.getValue(), statistics.genericGetMin()); - assertEquals(max.getValue(), statistics.genericGetMax()); - } else { - // Everything is null, statistics should be empty. - assertFalse(statistics.hasNonNullValue()); - } - } - - private void assertIntColumnStatistics(SerialIntegerColumnIterator iterator, Statistics statistics) { - MutableLong itemCount = new MutableLong(0); - MutableLong nullCount = new MutableLong(0); - MutableInt min = new MutableInt(NULL_INT); - MutableInt max = new MutableInt(NULL_INT); - - iterator.forEachRemaining((IntConsumer) value -> { - itemCount.increment(); - if (value == NULL_INT) { - nullCount.increment(); - } else { - if (min.getValue() == NULL_INT || value < min.getValue()) { - min.setValue(value); - } - if (max.getValue() == NULL_INT || value > max.getValue()) { - max.setValue(value); - } - } - }); - - assertEquals(nullCount.intValue(), statistics.getNumNulls()); - if (!itemCount.getValue().equals(nullCount.getValue())) { - // There are some non-null values, so min and max should be non-null and equal to observed values. - assertEquals(min.getValue(), statistics.genericGetMin()); - assertEquals(max.getValue(), statistics.genericGetMax()); - } else { - // Everything is null, statistics should be empty. 
- assertFalse(statistics.hasNonNullValue()); - } - } - - private void assertIntArrayColumnStatistics(SerialObjectColumnIterator iterator, - Statistics statistics) { - MutableLong itemCount = new MutableLong(0); - MutableLong nullCount = new MutableLong(0); - MutableInt min = new MutableInt(NULL_INT); - MutableInt max = new MutableInt(NULL_INT); - - iterator.forEachRemaining(values -> { - if (values == null) { - itemCount.increment(); - nullCount.increment(); - return; - } - for (final int value : values) { - itemCount.increment(); - if (value == NULL_INT) { - nullCount.increment(); - } else { - if (min.getValue() == NULL_INT || value < min.getValue()) { - min.setValue(value); - } - if (max.getValue() == NULL_INT || value > max.getValue()) { - max.setValue(value); - } - } - } - }); - - assertEquals(nullCount.intValue(), statistics.getNumNulls()); - if (!itemCount.getValue().equals(nullCount.getValue())) { - // There are some non-null values, so min and max should be non-null and equal to observed values. - assertEquals(min.getValue(), statistics.genericGetMin()); - assertEquals(max.getValue(), statistics.genericGetMax()); - } else { - // Everything is null, statistics should be empty. - assertFalse(statistics.hasNonNullValue()); - } - } - - private void assertIntVectorColumnStatistics(SerialObjectColumnIterator iterator, - Statistics statistics) { - MutableLong itemCount = new MutableLong(0); - MutableLong nullCount = new MutableLong(0); - MutableInt min = new MutableInt(NULL_INT); - MutableInt max = new MutableInt(NULL_INT); - - iterator.forEachRemaining(values -> { - if (values == null) { - itemCount.increment(); - nullCount.increment(); - return; - } - for (final int value : values) { - itemCount.increment(); - if (value == NULL_INT) { - nullCount.increment(); - } else { - if (min.getValue() == NULL_INT || value < min.getValue()) { - min.setValue(value); - } - if (max.getValue() == NULL_INT || value > max.getValue()) { - max.setValue(value); - } - } - } - }); - - assertEquals(nullCount.intValue(), statistics.getNumNulls()); - if (!itemCount.getValue().equals(nullCount.getValue())) { - // There are some non-null values, so min and max should be non-null and equal to observed values. - assertEquals(min.getValue(), statistics.genericGetMin()); - assertEquals(max.getValue(), statistics.genericGetMax()); - } else { - // Everything is null, statistics should be empty. - assertFalse(statistics.hasNonNullValue()); - } - } - - private void assertLongColumnStatistics(SerialLongColumnIterator iterator, Statistics statistics) { - MutableLong itemCount = new MutableLong(0); - MutableLong nullCount = new MutableLong(0); - MutableLong min = new MutableLong(NULL_LONG); - MutableLong max = new MutableLong(NULL_LONG); - - iterator.forEachRemaining((LongConsumer) value -> { - itemCount.increment(); - if (value == NULL_LONG) { - nullCount.increment(); - } else { - if (min.getValue() == NULL_LONG || value < min.getValue()) { - min.setValue(value); - } - if (max.getValue() == NULL_LONG || value > max.getValue()) { - max.setValue(value); - } - } - }); - - assertEquals(nullCount.intValue(), statistics.getNumNulls()); - if (!itemCount.getValue().equals(nullCount.getValue())) { - // There are some non-null values, so min and max should be non-null and equal to observed values. - assertEquals(min.getValue(), statistics.genericGetMin()); - assertEquals(max.getValue(), statistics.genericGetMax()); - } else { - // Everything is null, statistics should be empty. 
- assertFalse(statistics.hasNonNullValue()); - } - } - - private void assertLongArrayColumnStatistics(SerialObjectColumnIterator iterator, - Statistics statistics) { - MutableLong itemCount = new MutableLong(0); - MutableLong nullCount = new MutableLong(0); - MutableLong min = new MutableLong(NULL_LONG); - MutableLong max = new MutableLong(NULL_LONG); - - iterator.forEachRemaining(values -> { - if (values == null) { - itemCount.increment(); - nullCount.increment(); - return; - } - for (final long value : values) { - itemCount.increment(); - if (value == NULL_LONG) { - nullCount.increment(); - } else { - if (min.getValue() == NULL_LONG || value < min.getValue()) { - min.setValue(value); - } - if (max.getValue() == NULL_LONG || value > max.getValue()) { - max.setValue(value); - } - } - } - }); - - assertEquals(nullCount.intValue(), statistics.getNumNulls()); - if (!itemCount.getValue().equals(nullCount.getValue())) { - // There are some non-null values, so min and max should be non-null and equal to observed values. - assertEquals(min.getValue(), statistics.genericGetMin()); - assertEquals(max.getValue(), statistics.genericGetMax()); - } else { - // Everything is null, statistics should be empty. - assertFalse(statistics.hasNonNullValue()); - } - } - - private void assertLongVectorColumnStatistics(SerialObjectColumnIterator iterator, - Statistics statistics) { - MutableLong itemCount = new MutableLong(0); - MutableLong nullCount = new MutableLong(0); - MutableLong min = new MutableLong(NULL_LONG); - MutableLong max = new MutableLong(NULL_LONG); - - iterator.forEachRemaining(values -> { - if (values == null) { - itemCount.increment(); - nullCount.increment(); - return; - } - for (final long value : values) { - itemCount.increment(); - if (value == NULL_LONG) { - nullCount.increment(); - } else { - if (min.getValue() == NULL_LONG || value < min.getValue()) { - min.setValue(value); - } - if (max.getValue() == NULL_LONG || value > max.getValue()) { - max.setValue(value); - } - } - } - }); - - assertEquals(nullCount.intValue(), statistics.getNumNulls()); - if (!itemCount.getValue().equals(nullCount.getValue())) { - // There are some non-null values, so min and max should be non-null and equal to observed values. - assertEquals(min.getValue(), statistics.genericGetMin()); - assertEquals(max.getValue(), statistics.genericGetMax()); - } else { - // Everything is null, statistics should be empty. - assertFalse(statistics.hasNonNullValue()); - } - } - - private void assertFloatColumnStatistics(SerialFloatColumnIterator iterator, Statistics statistics) { - MutableLong itemCount = new MutableLong(0); - MutableLong nullCount = new MutableLong(0); - MutableFloat min = new MutableFloat(NULL_FLOAT); - MutableFloat max = new MutableFloat(NULL_FLOAT); - - iterator.forEachRemaining((FloatConsumer) value -> { - itemCount.increment(); - if (value == NULL_FLOAT) { - nullCount.increment(); - } else { - if (min.getValue() == NULL_FLOAT || value < min.getValue()) { - min.setValue(value); - } - if (max.getValue() == NULL_FLOAT || value > max.getValue()) { - max.setValue(value); - } - } - }); - - assertEquals(nullCount.intValue(), statistics.getNumNulls()); - if (!itemCount.getValue().equals(nullCount.getValue())) { - // There are some non-null values, so min and max should be non-null and equal to observed values. 
- // Use FloatComparisons.compare() to handle -0.0f == 0.0f properly - assertEquals(FloatComparisons.compare(min.getValue(), statistics.genericGetMin()), 0); - assertEquals(FloatComparisons.compare(max.getValue(), statistics.genericGetMax()), 0); - } else { - // Everything is null, statistics should be empty. - assertFalse(statistics.hasNonNullValue()); - } - } - - private void assertFloatArrayColumnStatistics(SerialObjectColumnIterator iterator, - Statistics statistics) { - MutableLong itemCount = new MutableLong(0); - MutableLong nullCount = new MutableLong(0); - MutableFloat min = new MutableFloat(NULL_FLOAT); - MutableFloat max = new MutableFloat(NULL_FLOAT); - - iterator.forEachRemaining(values -> { - if (values == null) { - itemCount.increment(); - nullCount.increment(); - return; - } - for (final float value : values) { - itemCount.increment(); - if (value == NULL_FLOAT) { - nullCount.increment(); - } else { - if (min.getValue() == NULL_FLOAT || value < min.getValue()) { - min.setValue(value); - } - if (max.getValue() == NULL_FLOAT || value > max.getValue()) { - max.setValue(value); - } - } - } - }); - - assertEquals(nullCount.intValue(), statistics.getNumNulls()); - if (!itemCount.getValue().equals(nullCount.getValue())) { - // There are some non-null values, so min and max should be non-null and equal to observed values. - // Use FloatComparisons.compare() to handle -0.0f == 0.0f properly - assertEquals(FloatComparisons.compare(min.getValue(), statistics.genericGetMin()), 0); - assertEquals(FloatComparisons.compare(max.getValue(), statistics.genericGetMax()), 0); - } else { - // Everything is null, statistics should be empty. - assertFalse(statistics.hasNonNullValue()); - } - } - - private void assertFloatVectorColumnStatistics(SerialObjectColumnIterator iterator, - Statistics statistics) { - MutableLong itemCount = new MutableLong(0); - MutableLong nullCount = new MutableLong(0); - MutableFloat min = new MutableFloat(NULL_FLOAT); - MutableFloat max = new MutableFloat(NULL_FLOAT); - - iterator.forEachRemaining(values -> { - if (values == null) { - itemCount.increment(); - nullCount.increment(); - return; - } - for (final float value : values) { - itemCount.increment(); - if (value == NULL_FLOAT) { - nullCount.increment(); - } else { - if (min.getValue() == NULL_FLOAT || value < min.getValue()) { - min.setValue(value); - } - if (max.getValue() == NULL_FLOAT || value > max.getValue()) { - max.setValue(value); - } - } - } - }); - - assertEquals(nullCount.intValue(), statistics.getNumNulls()); - if (!itemCount.getValue().equals(nullCount.getValue())) { - // There are some non-null values, so min and max should be non-null and equal to observed values. - // Use FloatComparisons.compare() to handle -0.0f == 0.0f properly - assertEquals(FloatComparisons.compare(min.getValue(), statistics.genericGetMin()), 0); - assertEquals(FloatComparisons.compare(max.getValue(), statistics.genericGetMax()), 0); - } else { - // Everything is null, statistics should be empty. 
- assertFalse(statistics.hasNonNullValue()); - } - } - - private void assertDoubleColumnStatistics(SerialDoubleColumnIterator iterator, Statistics statistics) { - MutableLong itemCount = new MutableLong(0); - MutableLong nullCount = new MutableLong(0); - MutableDouble min = new MutableDouble(NULL_DOUBLE); - MutableDouble max = new MutableDouble(NULL_DOUBLE); - - iterator.forEachRemaining((DoubleConsumer) value -> { - itemCount.increment(); - if (value == NULL_DOUBLE) { - nullCount.increment(); - } else { - if (min.getValue() == NULL_DOUBLE || value < min.getValue()) { - min.setValue(value); - } - if (max.getValue() == NULL_DOUBLE || value > max.getValue()) { - max.setValue(value); - } - } - }); - - assertEquals(nullCount.intValue(), statistics.getNumNulls()); - if (!itemCount.getValue().equals(nullCount.getValue())) { - // There are some non-null values, so min and max should be non-null and equal to observed values. - // Use DoubleComparisons.compare() to handle -0.0f == 0.0f properly - assertEquals(DoubleComparisons.compare(min.getValue(), statistics.genericGetMin()), 0); - assertEquals(DoubleComparisons.compare(max.getValue(), statistics.genericGetMax()), 0); - } else { - // Everything is null, statistics should be empty. - assertFalse(statistics.hasNonNullValue()); - } - } - - private void assertDoubleArrayColumnStatistics(SerialObjectColumnIterator iterator, - Statistics statistics) { - MutableLong itemCount = new MutableLong(0); - MutableLong nullCount = new MutableLong(0); - MutableDouble min = new MutableDouble(NULL_DOUBLE); - MutableDouble max = new MutableDouble(NULL_DOUBLE); - - iterator.forEachRemaining(values -> { - if (values == null) { - itemCount.increment(); - nullCount.increment(); - return; - } - for (final double value : values) { - itemCount.increment(); - if (value == NULL_DOUBLE) { - nullCount.increment(); - } else { - if (min.getValue() == NULL_DOUBLE || value < min.getValue()) { - min.setValue(value); - } - if (max.getValue() == NULL_DOUBLE || value > max.getValue()) { - max.setValue(value); - } - } - } - }); - - assertEquals(nullCount.intValue(), statistics.getNumNulls()); - if (!itemCount.getValue().equals(nullCount.getValue())) { - // There are some non-null values, so min and max should be non-null and equal to observed values. - // Use DoubleComparisons.compare() to handle -0.0f == 0.0f properly - assertEquals(DoubleComparisons.compare(min.getValue(), statistics.genericGetMin()), 0); - assertEquals(DoubleComparisons.compare(max.getValue(), statistics.genericGetMax()), 0); - } else { - // Everything is null, statistics should be empty. 
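The float and double assertions above route the min/max checks through FloatComparisons.compare() / DoubleComparisons.compare() rather than a plain assertEquals. The reason is that -0.0 and 0.0 are numerically equal but are distinguished by Double.compare() and by boxed equality, so a statistics value of 0.0 must not fail against an observed minimum of -0.0. Below is a minimal, self-contained illustration of that pitfall; zeroTolerantCompare is a hypothetical stand-in written for this sketch, not the actual Deephaven comparison utility.

public final class ZeroSignDemo {
    // Hypothetical stand-in: primitive '==' first (so -0.0 and 0.0 are treated as equal), then
    // Double.compare() for ordering and NaN handling. Illustrative only.
    static int zeroTolerantCompare(final double a, final double b) {
        return a == b ? 0 : Double.compare(a, b);
    }

    public static void main(final String[] args) {
        System.out.println(-0.0 == 0.0); // true: numerically equal
        System.out.println(Double.compare(-0.0, 0.0)); // -1: compare() distinguishes the sign of zero
        System.out.println(Double.valueOf(-0.0).equals(0.0)); // false: so would assertEquals on boxed values
        System.out.println(zeroTolerantCompare(-0.0, 0.0)); // 0: the behavior the assertions rely on
    }
}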
- assertFalse(statistics.hasNonNullValue()); - } - } - - private void assertDoubleVectorColumnStatistics(SerialObjectColumnIterator iterator, - Statistics statistics) { - MutableLong itemCount = new MutableLong(0); - MutableLong nullCount = new MutableLong(0); - MutableDouble min = new MutableDouble(NULL_DOUBLE); - MutableDouble max = new MutableDouble(NULL_DOUBLE); - - iterator.forEachRemaining(values -> { - if (values == null) { - itemCount.increment(); - nullCount.increment(); - return; - } - for (final double value : values) { - itemCount.increment(); - if (value == NULL_DOUBLE) { - nullCount.increment(); - } else { - if (min.getValue() == NULL_DOUBLE || value < min.getValue()) { - min.setValue(value); - } - if (max.getValue() == NULL_DOUBLE || value > max.getValue()) { - max.setValue(value); - } - } - } - }); - - assertEquals(nullCount.intValue(), statistics.getNumNulls()); - if (!itemCount.getValue().equals(nullCount.getValue())) { - // There are some non-null values, so min and max should be non-null and equal to observed values. - // Use DoubleComparisons.compare() to handle -0.0f == 0.0f properly - assertEquals(DoubleComparisons.compare(min.getValue(), statistics.genericGetMin()), 0); - assertEquals(DoubleComparisons.compare(max.getValue(), statistics.genericGetMax()), 0); - } else { - // Everything is null, statistics should be empty. - assertFalse(statistics.hasNonNullValue()); - } - } - - private void assertStringColumnStatistics(SerialObjectColumnIterator iterator, - Statistics statistics) { - MutableLong itemCount = new MutableLong(0); - MutableLong nullCount = new MutableLong(0); - MutableObject min = new MutableObject<>(null); - MutableObject max = new MutableObject<>(null); - - iterator.forEachRemaining((value) -> { - itemCount.increment(); - if (value == null) { - nullCount.increment(); - } else { - if (min.getValue() == null || value.compareTo(min.getValue()) < 0) { - min.setValue(value); - } - if (max.getValue() == null || value.compareTo(max.getValue()) > 0) { - max.setValue(value); - } - } - }); - - assertEquals(nullCount.intValue(), statistics.getNumNulls()); - if (!itemCount.getValue().equals(nullCount.getValue())) { - // There are some non-null values, so min and max should be non-null and equal to observed values. - assertEquals(Binary.fromString(min.getValue()), statistics.genericGetMin()); - assertEquals(Binary.fromString(max.getValue()), statistics.genericGetMax()); - } else { - // Everything is null, statistics should be empty. - assertFalse(statistics.hasNonNullValue()); - } - } - - private void assertStringArrayColumnStatistics(SerialObjectColumnIterator iterator, - Statistics statistics) { - MutableLong itemCount = new MutableLong(0); - MutableLong nullCount = new MutableLong(0); - MutableObject min = new MutableObject<>(null); - MutableObject max = new MutableObject<>(null); - - iterator.forEachRemaining(values -> { - if (values == null) { - itemCount.increment(); - nullCount.increment(); - return; - } - for (final String value : values) { - itemCount.increment(); - if (value == null) { - nullCount.increment(); - } else { - if (min.getValue() == null || value.compareTo(min.getValue()) < 0) { - min.setValue(value); - } - if (max.getValue() == null || value.compareTo(max.getValue()) > 0) { - max.setValue(value); - } - } - } - }); - - assertEquals(nullCount.intValue(), statistics.getNumNulls()); - if (!itemCount.getValue().equals(nullCount.getValue())) { - // There are some non-null values, so min and max should be non-null and equal to observed values. 
- assertEquals(Binary.fromString(min.getValue()), statistics.genericGetMin()); - assertEquals(Binary.fromString(max.getValue()), statistics.genericGetMax()); - } else { - // Everything is null, statistics should be empty. - assertFalse(statistics.hasNonNullValue()); - } - } - - private void assertStringVectorColumnStatistics(SerialObjectColumnIterator> iterator, - Statistics statistics) { - MutableLong itemCount = new MutableLong(0); - MutableLong nullCount = new MutableLong(0); - MutableObject min = new MutableObject<>(null); - MutableObject max = new MutableObject<>(null); - - iterator.forEachRemaining(values -> { - if (values == null) { - itemCount.increment(); - nullCount.increment(); - return; - } - for (String value : values) { - itemCount.increment(); - if (value == null) { - nullCount.increment(); - } else { - if (min.getValue() == null || value.compareTo(min.getValue()) < 0) { - min.setValue(value); - } - if (max.getValue() == null || value.compareTo(max.getValue()) > 0) { - max.setValue(value); - } - } - } - }); - - assertEquals(nullCount.intValue(), statistics.getNumNulls()); - if (!itemCount.getValue().equals(nullCount.getValue())) { - // There are some non-null values, so min and max should be non-null and equal to observed values. - assertEquals(Binary.fromString(min.getValue()), statistics.genericGetMin()); - assertEquals(Binary.fromString(max.getValue()), statistics.genericGetMax()); - } else { - // Everything is null, statistics should be empty. - assertFalse(statistics.hasNonNullValue()); - } - } - - private void assertInstantColumnStatistic(SerialObjectColumnIterator iterator, - Statistics statistics) { - MutableLong itemCount = new MutableLong(0); - MutableLong nullCount = new MutableLong(0); - MutableLong min = new MutableLong(NULL_LONG); - MutableLong max = new MutableLong(NULL_LONG); - - iterator.forEachRemaining((value) -> { - itemCount.increment(); - if (value == null) { - nullCount.increment(); - } else { - // DateTimeUtils.epochNanos() is the correct conversion for Instant to long. - if (min.getValue() == NULL_LONG || DateTimeUtils.epochNanos(value) < min.getValue()) { - min.setValue(DateTimeUtils.epochNanos(value)); - } - if (max.getValue() == NULL_LONG || DateTimeUtils.epochNanos(value) > max.getValue()) { - max.setValue(DateTimeUtils.epochNanos(value)); - } - } - }); - - assertEquals(nullCount.intValue(), statistics.getNumNulls()); - if (!itemCount.getValue().equals(nullCount.getValue())) { - // There are some non-null values, so min and max should be non-null and equal to observed values. - assertEquals(min.getValue(), statistics.genericGetMin()); - assertEquals(max.getValue(), statistics.genericGetMax()); - } else { - // Everything is null, statistics should be empty. - assertFalse(statistics.hasNonNullValue()); - } - } - - private void assertInstantArrayColumnStatistics(SerialObjectColumnIterator iterator, - Statistics statistics) { - MutableLong itemCount = new MutableLong(0); - MutableLong nullCount = new MutableLong(0); - MutableLong min = new MutableLong(NULL_LONG); - MutableLong max = new MutableLong(NULL_LONG); - - iterator.forEachRemaining(values -> { - if (values == null) { - itemCount.increment(); - nullCount.increment(); - return; - } - for (final Instant value : values) { - itemCount.increment(); - if (value == null) { - nullCount.increment(); - } else { - // DateTimeUtils.epochNanos() is the correct conversion for Instant to long. 
- if (min.getValue() == NULL_LONG || DateTimeUtils.epochNanos(value) < min.getValue()) { - min.setValue(DateTimeUtils.epochNanos(value)); - } - if (max.getValue() == NULL_LONG || DateTimeUtils.epochNanos(value) > max.getValue()) { - max.setValue(DateTimeUtils.epochNanos(value)); - } - } - } - }); - - assertEquals(nullCount.intValue(), statistics.getNumNulls()); - if (!itemCount.getValue().equals(nullCount.getValue())) { - // There are some non-null values, so min and max should be non-null and equal to observed values. - assertEquals(min.getValue(), statistics.genericGetMin()); - assertEquals(max.getValue(), statistics.genericGetMax()); - } else { - // Everything is null, statistics should be empty. - assertFalse(statistics.hasNonNullValue()); - } - } - - private void assertInstantVectorColumnStatistics(SerialObjectColumnIterator> iterator, - Statistics statistics) { - MutableLong itemCount = new MutableLong(0); - MutableLong nullCount = new MutableLong(0); - MutableLong min = new MutableLong(NULL_LONG); - MutableLong max = new MutableLong(NULL_LONG); - - iterator.forEachRemaining(values -> { - if (values == null) { - itemCount.increment(); - nullCount.increment(); - return; - } - for (Instant value : values) { - itemCount.increment(); - if (value == null) { - nullCount.increment(); - } else { - // DateTimeUtils.epochNanos() is the correct conversion for Instant to long. - if (min.getValue() == NULL_LONG || DateTimeUtils.epochNanos(value) < min.getValue()) { - min.setValue(DateTimeUtils.epochNanos(value)); - } - if (max.getValue() == NULL_LONG || DateTimeUtils.epochNanos(value) > max.getValue()) { - max.setValue(DateTimeUtils.epochNanos(value)); - } - } - } - }); - - assertEquals(nullCount.intValue(), statistics.getNumNulls()); - if (!itemCount.getValue().equals(nullCount.getValue())) { - // There are some non-null values, so min and max should be non-null and equal to observed - // values. - assertEquals(min.getValue(), statistics.genericGetMin()); - assertEquals(max.getValue(), statistics.genericGetMax()); - } else { - // Everything is null, statistics should be empty. - assertFalse(statistics.hasNonNullValue()); - } - } - - private void assertBigDecimalColumnStatistics(SerialObjectColumnIterator iterator, - Statistics statistics) { - MutableLong itemCount = new MutableLong(0); - MutableLong nullCount = new MutableLong(0); - MutableObject min = new MutableObject<>(null); - MutableObject max = new MutableObject<>(null); - - iterator.forEachRemaining((value) -> { - itemCount.increment(); - if (value == null) { - nullCount.increment(); - } else { - if (min.getValue() == null || value.compareTo(min.getValue()) < 0) { - min.setValue(value); - } - if (max.getValue() == null || value.compareTo(max.getValue()) > 0) { - max.setValue(value); - } - } - }); - - assertEquals(nullCount.intValue(), statistics.getNumNulls()); - if (!itemCount.getValue().equals(nullCount.getValue())) { - // There are some non-null values, so min and max should be non-null and equal to observed values. - assertEquals(Binary.fromConstantByteArray(min.getValue().unscaledValue().toByteArray()), - statistics.genericGetMin()); - assertEquals(Binary.fromConstantByteArray(max.getValue().unscaledValue().toByteArray()), - statistics.genericGetMax()); - } else { - // Everything is null, statistics should be empty. 
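The Instant assertions above compare against INT64 statistics by converting each observed Instant with DateTimeUtils.epochNanos(), and the BigDecimal assertions compare against the unscaled value's two's-complement bytes. A small standalone sketch of those two conversions using only the JDK; the local epochNanos() helper is an assumption that mirrors what DateTimeUtils.epochNanos() yields for non-null values, not the Deephaven utility itself.

import java.math.BigDecimal;
import java.math.BigInteger;
import java.time.Instant;

public final class StatisticsValueConversions {
    // Assumed equivalent of DateTimeUtils.epochNanos(Instant) for non-null inputs:
    // nanoseconds since the epoch, which is what the INT64 min/max statistics hold.
    static long epochNanos(final Instant value) {
        return value.getEpochSecond() * 1_000_000_000L + value.getNano();
    }

    public static void main(final String[] args) {
        final Instant instant = Instant.parse("2023-12-27T00:00:00.000000123Z");
        System.out.println(epochNanos(instant)); // 1703635200000000123

        // BigDecimal statistics carry only the unscaled two's-complement bytes; the scale lives in
        // the column's decimal logical type, not in the statistics bytes themselves.
        final BigDecimal decimal = new BigDecimal("12.34"); // unscaled value 1234, scale 2
        final byte[] statisticsBytes = decimal.unscaledValue().toByteArray();
        System.out.println(new BigInteger(statisticsBytes)); // 1234
    }
}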
- assertFalse(statistics.hasNonNullValue()); - } - } - - private void assertBigIntegerColumnStatistics(SerialObjectColumnIterator iterator, - Statistics statistics) { - MutableLong itemCount = new MutableLong(0); - MutableLong nullCount = new MutableLong(0); - MutableObject min = new MutableObject<>(null); - MutableObject max = new MutableObject<>(null); - - iterator.forEachRemaining((value) -> { - itemCount.increment(); - if (value == null) { - nullCount.increment(); - } else { - if (min.getValue() == null || value.compareTo(min.getValue()) < 0) { - min.setValue(value); - } - if (max.getValue() == null || value.compareTo(max.getValue()) > 0) { - max.setValue(value); - } - } - }); - - assertEquals(nullCount.intValue(), statistics.getNumNulls()); - if (!itemCount.getValue().equals(nullCount.getValue())) { - // There are some non-null values, so min and max should be non-null and equal to observed values. - assertEquals(Binary.fromConstantByteArray(min.getValue().toByteArray()), statistics.genericGetMin()); - assertEquals(Binary.fromConstantByteArray(max.getValue().toByteArray()), statistics.genericGetMax()); - } else { - // Everything is null, statistics should be empty. - assertFalse(statistics.hasNonNullValue()); - } - } + // + // /** + // * These are tests for writing to a table with grouping columns to a parquet file and making sure there are no + // * unnecessary files left in the directory after we finish writing. + // */ + // @Test + // public void groupingColumnsBasicWriteTests() { + // groupingColumnsBasicWriteTestsImpl(SINGLE_WRITER); + // groupingColumnsBasicWriteTestsImpl(MULTI_WRITER); + // } + // + // public void groupingColumnsBasicWriteTestsImpl(TestParquetTableWriter writer) { + // // Create an empty parent directory + // final File parentDir = new File(rootFile, "tempDir"); + // parentDir.mkdir(); + // assertTrue(parentDir.exists() && parentDir.isDirectory() && parentDir.list().length == 0); + // + // Integer data[] = new Integer[500 * 4]; + // for (int i = 0; i < data.length; i++) { + // data[i] = i / 4; + // } + // final TableDefinition tableDefinition = TableDefinition.of(ColumnDefinition.ofInt("vvv").withGrouping()); + // final Table tableToSave = newTable(tableDefinition, TableTools.col("vvv", data)); + // + // final String destFilename = "groupingColumnsWriteTests.parquet"; + // final File destFile = new File(parentDir, destFilename); + // writer.writeTable(tableToSave, destFile); + // String vvvIndexFilePath = ".dh_metadata/indexes/vvv/index_vvv_groupingColumnsWriteTests.parquet"; + // verifyFilesInDir(parentDir, new String[] {destFilename}, Map.of("vvv", new String[] {vvvIndexFilePath})); + // + // checkSingleTable(tableToSave, destFile); + // + // // Verify that the key-value metadata in the file has the correct name + // ParquetTableLocationKey tableLocationKey = + // new ParquetTableLocationKey(destFile, 0, null, ParquetInstructions.EMPTY); + // String metadataString = tableLocationKey.getMetadata().getFileMetaData().toString(); + // assertTrue(metadataString.contains(vvvIndexFilePath)); + // + // // Write another table but this write should fail + // final TableDefinition badTableDefinition = TableDefinition.of(ColumnDefinition.ofInt("www").withGrouping()); + // final Table badTable = newTable(badTableDefinition, TableTools.col("www", data)) + // .updateView("InputString = ii % 2 == 0 ? 
Long.toString(ii) : null", "A=InputString.charAt(0)"); + // try { + // writer.writeTable(badTable, destFile); + // TestCase.fail("Exception expected for invalid formula"); + // } catch (UncheckedDeephavenException e) { + // assertTrue(e.getCause() instanceof FormulaEvaluationException); + // } + // + // // Make sure that original file is preserved and no temporary files + // verifyFilesInDir(parentDir, new String[] {destFilename}, Map.of("vvv", new String[] {vvvIndexFilePath})); + // checkSingleTable(tableToSave, destFile); + // FileUtils.deleteRecursively(parentDir); + // } + // + // @Test + // public void legacyGroupingFileReadTest() { + // final String path = + // ParquetTableReadWriteTest.class.getResource("/ParquetDataWithLegacyGroupingInfo.parquet").getFile(); + // final File destFile = new File(path); + // + // // Read the legacy file and verify that grouping column is read correctly + // final Table fromDisk = readParquetFileFromGitLFS(destFile); + // final String groupingColName = "gcol"; + // assertTrue(fromDisk.getDefinition().getColumn(groupingColName).isGrouping()); + // + // // Verify that the key-value metadata in the file has the correct legacy grouping file name + // final ParquetTableLocationKey tableLocationKey = + // new ParquetTableLocationKey(destFile, 0, null, ParquetInstructions.EMPTY); + // final String metadataString = tableLocationKey.getMetadata().getFileMetaData().toString(); + // String groupingFileName = ParquetTools.legacyGroupingFileName(destFile, groupingColName); + // assertTrue(metadataString.contains(groupingFileName)); + // + // // Following is how this file was generated, so verify the table read from disk against this + // Integer data[] = new Integer[500 * 4]; + // for (int i = 0; i < data.length; i++) { + // data[i] = i / 4; + // } + // final TableDefinition tableDefinition = + // TableDefinition.of(ColumnDefinition.ofInt(groupingColName).withGrouping()); + // final Table table = newTable(tableDefinition, TableTools.col(groupingColName, data)); + // assertTableEquals(fromDisk, table); + // } + // + // @Test + // public void parquetDirectoryWithDotFilesTest() throws IOException { + // // Create an empty parent directory + // final File parentDir = new File(rootFile, "tempDir"); + // parentDir.mkdir(); + // assertTrue(parentDir.exists() && parentDir.isDirectory() && parentDir.list().length == 0); + // + // Integer data[] = new Integer[500 * 4]; + // for (int i = 0; i < data.length; i++) { + // data[i] = i / 4; + // } + // final TableDefinition tableDefinition = TableDefinition.of(ColumnDefinition.ofInt("vvv").withGrouping()); + // final Table tableToSave = newTable(tableDefinition, TableTools.col("vvv", data)); + // + // final String destFilename = "data.parquet"; + // final File destFile = new File(parentDir, destFilename); + // writeTable(tableToSave, destFile); + // String vvvIndexFilePath = ".dh_metadata/indexes/vvv/index_vvv_data.parquet"; + // verifyFilesInDir(parentDir, new String[] {destFilename}, Map.of("vvv", new String[] {vvvIndexFilePath})); + // + // // Call readTable on parent directory + // Table fromDisk = readFlatPartitionedTable(parentDir, EMPTY); + // assertTableEquals(fromDisk, tableToSave); + // + // // Add an empty dot file and dot directory (with valid parquet files) in the parent directory + // final File dotFile = new File(parentDir, ".dotFile"); + // assertTrue(dotFile.createNewFile()); + // final File dotDir = new File(parentDir, ".dotDir"); + // assertTrue(dotDir.mkdir()); + // final Table someTable = 
TableTools.emptyTable(5).update("A=(int)i"); + // writeTable(someTable, new File(dotDir, "data.parquet")); + // fromDisk = readFlatPartitionedTable(parentDir, EMPTY); + // assertTableEquals(fromDisk, tableToSave); + // + // // Add a dot parquet in parent directory + // final Table anotherTable = TableTools.emptyTable(5).update("A=(int)i"); + // final File pqDotFile = new File(parentDir, ".dotFile.parquet"); + // writeTable(anotherTable, pqDotFile); + // fromDisk = readFlatPartitionedTable(parentDir, EMPTY); + // assertTableEquals(fromDisk, tableToSave); + // } + // + // @Test + // public void partitionedParquetWithDotFilesTest() throws IOException { + // // Create an empty parent directory + // final File parentDir = new File(rootFile, "tempDir"); + // parentDir.mkdir(); + // assertTrue(parentDir.exists() && parentDir.isDirectory() && parentDir.list().length == 0); + // + // final Table someTable = TableTools.emptyTable(5).update("A=(int)i"); + // final File firstPartition = new File(parentDir, "X=A"); + // final File firstDataFile = new File(firstPartition, "data.parquet"); + // final File secondPartition = new File(parentDir, "X=B"); + // final File secondDataFile = new File(secondPartition, "data.parquet"); + // + // writeTable(someTable, firstDataFile); + // writeTable(someTable, secondDataFile); + // + // Table partitionedTable = readKeyValuePartitionedTable(parentDir, EMPTY).select(); + // final Set columnsSet = partitionedTable.getDefinition().getColumnNameSet(); + // assertTrue(columnsSet.size() == 2 && columnsSet.contains("A") && columnsSet.contains("X")); + // + // // Add an empty dot file and dot directory (with valid parquet files) in one of the partitions + // final File dotFile = new File(firstPartition, ".dotFile"); + // assertTrue(dotFile.createNewFile()); + // final File dotDir = new File(firstPartition, ".dotDir"); + // assertTrue(dotDir.mkdir()); + // writeTable(someTable, new File(dotDir, "data.parquet")); + // Table fromDisk = readKeyValuePartitionedTable(parentDir, EMPTY); + // assertTableEquals(fromDisk, partitionedTable); + // + // // Add a dot parquet file in one of the partitions directory + // final Table anotherTable = TableTools.emptyTable(5).update("B=(int)i"); + // final File pqDotFile = new File(secondPartition, ".dotFile.parquet"); + // writeTable(anotherTable, pqDotFile); + // fromDisk = readKeyValuePartitionedTable(parentDir, EMPTY); + // assertTableEquals(fromDisk, partitionedTable); + // } + // + // /** + // * These are tests for writing multiple parquet tables with grouping columns. 
+ // */ + // @Test + // public void writeMultiTableGroupingColumnTest() { + // // Create an empty parent directory + // final File parentDir = new File(rootFile, "tempDir"); + // parentDir.mkdir(); + // + // Integer data[] = new Integer[500 * 4]; + // for (int i = 0; i < data.length; i++) { + // data[i] = i / 4; + // } + // final TableDefinition tableDefinition = TableDefinition.of(ColumnDefinition.ofInt("vvv").withGrouping()); + // final Table firstTable = newTable(tableDefinition, TableTools.col("vvv", data)); + // final String firstFilename = "firstTable.parquet"; + // final File firstDestFile = new File(parentDir, firstFilename); + // + // final Table secondTable = newTable(tableDefinition, TableTools.col("vvv", data)); + // final String secondFilename = "secondTable.parquet"; + // final File secondDestFile = new File(parentDir, secondFilename); + // + // Table[] tablesToSave = new Table[] {firstTable, secondTable}; + // File[] destFiles = new File[] {firstDestFile, secondDestFile}; + // + // ParquetTools.writeTables(tablesToSave, firstTable.getDefinition(), destFiles); + // + // String firstIndexFilePath = ".dh_metadata/indexes/vvv/index_vvv_firstTable.parquet"; + // String secondIndexFilePath = ".dh_metadata/indexes/vvv/index_vvv_secondTable.parquet"; + // verifyFilesInDir(parentDir, new String[] {firstFilename, secondFilename}, + // Map.of("vvv", new String[] {firstIndexFilePath, secondIndexFilePath})); + // + // // Verify that the key-value metadata in the file has the correct name + // ParquetTableLocationKey tableLocationKey = + // new ParquetTableLocationKey(firstDestFile, 0, null, ParquetInstructions.EMPTY); + // String metadataString = tableLocationKey.getMetadata().getFileMetaData().toString(); + // assertTrue(metadataString.contains(firstIndexFilePath)); + // tableLocationKey = new ParquetTableLocationKey(secondDestFile, 0, null, ParquetInstructions.EMPTY); + // metadataString = tableLocationKey.getMetadata().getFileMetaData().toString(); + // assertTrue(metadataString.contains(secondIndexFilePath)); + // + // // Read back the files and verify contents match + // checkSingleTable(firstTable, firstDestFile); + // checkSingleTable(secondTable, secondDestFile); + // } + // + // @Test + // public void groupingColumnsOverwritingTests() { + // groupingColumnsOverwritingTestsImpl(SINGLE_WRITER); + // groupingColumnsOverwritingTestsImpl(MULTI_WRITER); + // } + // + // public void groupingColumnsOverwritingTestsImpl(TestParquetTableWriter writer) { + // // Create an empty parent directory + // final File parentDir = new File(rootFile, "tempDir"); + // parentDir.mkdir(); + // assertTrue(parentDir.exists() && parentDir.isDirectory() && parentDir.list().length == 0); + // + // Integer data[] = new Integer[500 * 4]; + // for (int i = 0; i < data.length; i++) { + // data[i] = i / 4; + // } + // final TableDefinition tableDefinition = TableDefinition.of(ColumnDefinition.ofInt("vvv").withGrouping()); + // final Table tableToSave = newTable(tableDefinition, TableTools.col("vvv", data)); + // + // final String destFilename = "groupingColumnsWriteTests.parquet"; + // final File destFile = new File(parentDir, destFilename); + // writer.writeTable(tableToSave, destFile); + // String vvvIndexFilePath = ".dh_metadata/indexes/vvv/index_vvv_groupingColumnsWriteTests.parquet"; + // + // // Write a new table successfully at the same position with different grouping columns + // final TableDefinition anotherTableDefinition = TableDefinition.of(ColumnDefinition.ofInt("xxx").withGrouping()); + // Table 
anotherTableToSave = newTable(anotherTableDefinition, TableTools.col("xxx", data)); + // writer.writeTable(anotherTableToSave, destFile); + // final String xxxIndexFilePath = ".dh_metadata/indexes/xxx/index_xxx_groupingColumnsWriteTests.parquet"; + // + // // The directory now should contain the updated table, its grouping file for column xxx, and old grouping file + // // for column vvv + // verifyFilesInDir(parentDir, new String[] {destFilename}, + // Map.of("vvv", new String[] {vvvIndexFilePath}, + // "xxx", new String[] {xxxIndexFilePath})); + // + // checkSingleTable(anotherTableToSave, destFile); + // + // ParquetTableLocationKey tableLocationKey = + // new ParquetTableLocationKey(destFile, 0, null, ParquetInstructions.EMPTY); + // String metadataString = tableLocationKey.getMetadata().getFileMetaData().toString(); + // assertTrue(metadataString.contains(xxxIndexFilePath) && !metadataString.contains(vvvIndexFilePath)); + // + // // Overwrite the table + // writer.writeTable(anotherTableToSave, destFile); + // + // // The directory should still contain the updated table, its grouping file for column xxx, and old grouping file + // // for column vvv + // final File xxxIndexFile = new File(parentDir, xxxIndexFilePath); + // final File backupXXXIndexFile = ParquetTools.getBackupFile(xxxIndexFile); + // final String backupXXXIndexFileName = backupXXXIndexFile.getName(); + // verifyFilesInDir(parentDir, new String[] {destFilename}, + // Map.of("vvv", new String[] {vvvIndexFilePath}, + // "xxx", new String[] {xxxIndexFilePath})); + // + // tableLocationKey = new ParquetTableLocationKey(destFile, 0, null, ParquetInstructions.EMPTY); + // metadataString = tableLocationKey.getMetadata().getFileMetaData().toString(); + // assertTrue(metadataString.contains(xxxIndexFilePath) && !metadataString.contains(vvvIndexFilePath) + // && !metadataString.contains(backupXXXIndexFileName)); + // FileUtils.deleteRecursively(parentDir); + // } + // + // @Test + // public void readChangedUnderlyingFileTests() { + // readChangedUnderlyingFileTestsImpl(SINGLE_WRITER); + // readChangedUnderlyingFileTestsImpl(MULTI_WRITER); + // } + // + // public void readChangedUnderlyingFileTestsImpl(TestParquetTableWriter writer) { + // // Write a table to parquet file and read it back + // final Table tableToSave = TableTools.emptyTable(5).update("A=(int)i", "B=(long)i", "C=(double)i"); + // final String filename = "readChangedUnderlyingFileTests.parquet"; + // final File destFile = new File(rootFile, filename); + // writer.writeTable(tableToSave, destFile); + // Table fromDisk = readSingleFileTable(destFile, EMPTY); + // // At this point, fromDisk is not fully materialized in the memory and would be read from the file on demand + // + // // Change the underlying file + // final Table stringTable = TableTools.emptyTable(5).update("InputString = Long.toString(ii)"); + // writer.writeTable(stringTable, destFile); + // Table stringFromDisk = readSingleFileTable(destFile, EMPTY).select(); + // assertTableEquals(stringTable, stringFromDisk); + // + // // Close all the file handles so that next time when fromDisk is accessed, we need to reopen the file handle + // TrackedFileHandleFactory.getInstance().closeAll(); + // + // // Read back fromDisk. Since the underlying file has changed, we expect this to fail. 
+ // try { + // fromDisk.coalesce(); + // TestCase.fail("Expected TableDataException"); + // } catch (TableDataException ignored) { + // // expected + // } + // } + // + // @Test + // public void readModifyWriteTests() { + // readModifyWriteTestsImpl(SINGLE_WRITER); + // readModifyWriteTestsImpl(MULTI_WRITER); + // } + // + // public void readModifyWriteTestsImpl(TestParquetTableWriter writer) { + // // Write a table to parquet file and read it back + // final Table tableToSave = TableTools.emptyTable(5).update("A=(int)i", "B=(long)i", "C=(double)i"); + // final String filename = "readModifyWriteTests.parquet"; + // final File destFile = new File(rootFile, filename); + // writer.writeTable(tableToSave, destFile); + // Table fromDisk = readSingleFileTable(destFile, EMPTY); + // // At this point, fromDisk is not fully materialized in the memory and would be read from the file on demand + // + // // Create a view table on fromDisk which should fail on writing, and try to write at the same location + // // Since we are doing a view() operation and adding a new column and overwriting an existing column, the table + // // won't be materialized in memory or cache. + // final Table badTable = + // fromDisk.view("InputString = ii % 2 == 0 ? Long.toString(ii) : null", "A=InputString.charAt(0)"); + // try { + // writer.writeTable(badTable, destFile); + // TestCase.fail(); + // } catch (UncheckedDeephavenException e) { + // assertTrue(e.getCause() instanceof FormulaEvaluationException); + // } + // + // // Close all old file handles so that we read the file path fresh instead of using any old handles + // TrackedFileHandleFactory.getInstance().closeAll(); + // + // // Read back fromDisk and compare it with original table. If the underlying file has not been corrupted or + // // swapped out, then we would not be able to read from the file + // assertTableEquals(tableToSave, fromDisk); + // } + // + // @Test + // public void dictionaryEncodingTest() { + // Collection columns = new ArrayList<>(Arrays.asList( + // "shortStringColumn = `Row ` + i", + // "longStringColumn = `This is row ` + i", + // "someIntColumn = i")); + // final int numRows = 10; + // final ParquetInstructions writeInstructions = new ParquetInstructions.Builder() + // .setMaximumDictionarySize(100) // Force "longStringColumn" to use non-dictionary encoding + // .build(); + // final Table stringTable = TableTools.emptyTable(numRows).select(Selectable.from(columns)); + // final File dest = new File(rootFile + File.separator + "dictEncoding.parquet"); + // writeTable(stringTable, dest, writeInstructions); + // checkSingleTable(stringTable, dest); + // + // // Verify that string columns are properly dictionary encoded + // final ParquetMetadata metadata = + // new ParquetTableLocationKey(dest, 0, null, ParquetInstructions.EMPTY).getMetadata(); + // final String firstColumnMetadata = metadata.getBlocks().get(0).getColumns().get(0).toString(); + // assertTrue(firstColumnMetadata.contains("shortStringColumn") && firstColumnMetadata.contains("RLE_DICTIONARY")); + // final String secondColumnMetadata = metadata.getBlocks().get(0).getColumns().get(1).toString(); + // assertTrue( + // secondColumnMetadata.contains("longStringColumn") && !secondColumnMetadata.contains("RLE_DICTIONARY")); + // final String thirdColumnMetadata = metadata.getBlocks().get(0).getColumns().get(2).toString(); + // assertTrue(thirdColumnMetadata.contains("someIntColumn") && !thirdColumnMetadata.contains("RLE_DICTIONARY")); + // } + // + // @Test + // public void 
overflowingStringsTest() { + // // Test the behavior of writing parquet files if entries exceed the page size limit + // final int pageSize = ParquetInstructions.MIN_TARGET_PAGE_SIZE; + // final char[] data = new char[pageSize / 4]; + // String someString = new String(data); + // Collection columns = new ArrayList<>(Arrays.asList( + // "someStringColumn = `" + someString + "` + i%10")); + // final long numRows = 10; + // ColumnChunkMetaData columnMetadata = overflowingStringsTestHelper(columns, numRows, pageSize); + // String metadataStr = columnMetadata.toString(); + // assertTrue(metadataStr.contains("someStringColumn") && metadataStr.contains("PLAIN") + // && !metadataStr.contains("RLE_DICTIONARY")); + // + // // We exceed page size on hitting 4 rows, and we have 10 total rows. + // // Therefore, we should have total 4 pages containing 3, 3, 3, 1 rows respectively. + // assertEquals(columnMetadata.getEncodingStats().getNumDataPagesEncodedAs(Encoding.PLAIN), 4); + // + // final char[] veryLongData = new char[pageSize]; + // someString = new String(veryLongData); + // columns = new ArrayList<>( + // Arrays.asList("someStringColumn = ii % 2 == 0 ? Long.toString(ii) : `" + someString + "` + ii")); + // columnMetadata = overflowingStringsTestHelper(columns, numRows, pageSize); + // // We will have 10 pages each containing 1 row. + // assertEquals(columnMetadata.getEncodingStats().getNumDataPagesEncodedAs(Encoding.PLAIN), 10); + // + // // Table with rows of null alternating with strings exceeding the page size + // columns = new ArrayList<>(Arrays.asList("someStringColumn = ii % 2 == 0 ? null : `" + someString + "` + ii")); + // columnMetadata = overflowingStringsTestHelper(columns, numRows, pageSize); + // // We will have 6 pages containing 1, 2, 2, 2, 2, 1 rows. 
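The expected page counts in the comments above follow from simple arithmetic: once adding another row would push the page past the target size, a new page starts, so N rows at a given rows-per-page split into ceil(N / rowsPerPage) pages with the remainder in the last one. The sketch below models only that expectation arithmetic (an assumption for illustration, not the actual Parquet writer's paging code).

import java.util.ArrayList;
import java.util.List;

public final class PageSplitArithmetic {
    // Hypothetical model: each page holds up to 'rowsPerPage' rows, the remainder goes in the last page.
    static List<Integer> expectedPageRowCounts(final long numRows, final int rowsPerPage) {
        final List<Integer> pages = new ArrayList<>();
        long remaining = numRows;
        while (remaining > 0) {
            final int pageRows = (int) Math.min(rowsPerPage, remaining);
            pages.add(pageRows);
            remaining -= pageRows;
        }
        return pages;
    }

    public static void main(final String[] args) {
        // Strings of ~pageSize/4 characters: the target is exceeded on the 4th row, so 3 rows fit per page.
        System.out.println(expectedPageRowCounts(10, 3)); // [3, 3, 3, 1] -> 4 PLAIN-encoded data pages
        // Strings of ~pageSize characters: every row overflows the target on its own.
        System.out.println(expectedPageRowCounts(10, 1)); // [1, 1, 1, 1, 1, 1, 1, 1, 1, 1] -> 10 pages
    }
}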
+ // assertEquals(columnMetadata.getEncodingStats().getNumDataPagesEncodedAs(Encoding.PLAIN), 6); + // } + // + // private static ColumnChunkMetaData overflowingStringsTestHelper(final Collection columns, + // final long numRows, final int pageSize) { + // final ParquetInstructions writeInstructions = new ParquetInstructions.Builder() + // .setTargetPageSize(pageSize) // Force a small page size to cause splitting across pages + // .setMaximumDictionarySize(50) // Force "someStringColumn" to use non-dictionary encoding + // .build(); + // Table stringTable = TableTools.emptyTable(numRows).select(Selectable.from(columns)); + // final File dest = new File(rootFile + File.separator + "overflowingStringsTest.parquet"); + // writeTable(stringTable, dest, writeInstructions); + // checkSingleTable(stringTable, dest); + // + // ParquetMetadata metadata = new ParquetTableLocationKey(dest, 0, null, ParquetInstructions.EMPTY).getMetadata(); + // ColumnChunkMetaData columnMetadata = metadata.getBlocks().get(0).getColumns().get(0); + // return columnMetadata; + // } + // + // @Test + // public void overflowingCodecsTest() { + // final int pageSize = ParquetInstructions.MIN_TARGET_PAGE_SIZE; + // final ParquetInstructions writeInstructions = new ParquetInstructions.Builder() + // .setTargetPageSize(pageSize) // Force a small page size to cause splitting across pages + // .addColumnCodec("VariableWidthByteArrayColumn", SimpleByteArrayCodec.class.getName()) + // .build(); + // + // final ColumnDefinition columnDefinition = + // ColumnDefinition.fromGenericType("VariableWidthByteArrayColumn", byte[].class, byte.class); + // final TableDefinition tableDefinition = TableDefinition.of(columnDefinition); + // final byte[] byteArray = new byte[pageSize / 2]; + // final Table table = newTable(tableDefinition, + // TableTools.col("VariableWidthByteArrayColumn", byteArray, byteArray, byteArray)); + // + // final File dest = new File(rootFile + File.separator + "overflowingCodecsTest.parquet"); + // writeTable(table, dest, writeInstructions); + // checkSingleTable(table, dest); + // + // final ParquetMetadata metadata = + // new ParquetTableLocationKey(dest, 0, null, ParquetInstructions.EMPTY).getMetadata(); + // final String metadataStr = metadata.getFileMetaData().getKeyValueMetaData().get("deephaven"); + // assertTrue( + // metadataStr.contains("VariableWidthByteArrayColumn") && metadataStr.contains("SimpleByteArrayCodec")); + // final ColumnChunkMetaData columnMetadata = metadata.getBlocks().get(0).getColumns().get(0); + // final String columnMetadataStr = columnMetadata.toString(); + // assertTrue(columnMetadataStr.contains("VariableWidthByteArrayColumn") && columnMetadataStr.contains("PLAIN")); + // // Each byte array is of half the page size. So we exceed page size on hitting 3 byteArrays. + // // Therefore, we should have total 2 pages containing 2, 1 rows respectively. + // assertEquals(columnMetadata.getEncodingStats().getNumDataPagesEncodedAs(Encoding.PLAIN), 2); + // } + // + // @Test + // public void readWriteStatisticsTest() { + // // Test simple structured table. 
+    // final ColumnDefinition columnDefinition =
+    // ColumnDefinition.fromGenericType("VariableWidthByteArrayColumn", byte[].class, byte.class);
+    // final TableDefinition tableDefinition = TableDefinition.of(columnDefinition);
+    // final byte[] byteArray = new byte[] {1, 2, 3, 4, NULL_BYTE, 6, 7, 8, 9, NULL_BYTE, 11, 12, 13};
+    // final Table simpleTable = newTable(tableDefinition,
+    // TableTools.col("VariableWidthByteArrayColumn", null, byteArray, byteArray, byteArray, byteArray,
+    // byteArray));
+    // final File simpleTableDest = new File(rootFile, "ParquetTest_simple_statistics_test.parquet");
+    // writeTable(simpleTable, simpleTableDest);
+    //
+    // checkSingleTable(simpleTable, simpleTableDest);
+    //
+    // assertTableStatistics(simpleTable, simpleTableDest);
+    //
+    // // Test flat columns.
+    // final Table flatTableToSave = getTableFlat(10_000, true, true);
+    // final File flatTableDest = new File(rootFile, "ParquetTest_flat_statistics_test.parquet");
+    // writeTable(flatTableToSave, flatTableDest);
+    //
+    // checkSingleTable(maybeFixBigDecimal(flatTableToSave), flatTableDest);
+    //
+    // assertTableStatistics(flatTableToSave, flatTableDest);
+    //
+    // // Test nested columns.
+    // final Table groupedTableToSave = getGroupedTable(10_000, true);
+    // final File groupedTableDest = new File(rootFile, "ParquetTest_grouped_statistics_test.parquet");
+    // writeTable(groupedTableToSave, groupedTableDest, groupedTableToSave.getDefinition());
+    //
+    // checkSingleTable(groupedTableToSave, groupedTableDest);
+    //
+    // assertTableStatistics(groupedTableToSave, groupedTableDest);
+    // }
+    //
+    // @Test
+    // public void readWriteDateTimeTest() {
+    // final int NUM_ROWS = 1000;
+    // final Table table = TableTools.emptyTable(NUM_ROWS).view(
+    // "someDateColumn = java.time.LocalDate.ofEpochDay(i)",
+    // "someTimeColumn = java.time.LocalTime.of(i%24, i%60, (i+10)%60)",
+    // "someLocalDateTimeColumn = java.time.LocalDateTime.of(2000+i%10, i%12+1, i%30+1, (i+4)%24, (i+5)%60, (i+6)%60,
+    // i)",
+    // "someInstantColumn = DateTimeUtils.now() + i").select();
+    // final File dest = new File(rootFile, "readWriteDateTimeTest.parquet");
+    // writeReadTableTest(table, dest);
+    //
+    // // Verify that the types are correct in the schema
+    // final ParquetMetadata metadata =
+    // new ParquetTableLocationKey(dest, 0, null, ParquetInstructions.EMPTY).getMetadata();
+    // final ColumnChunkMetaData dateColMetadata = metadata.getBlocks().get(0).getColumns().get(0);
+    // assertTrue(dateColMetadata.toString().contains("someDateColumn"));
+    // assertEquals(PrimitiveType.PrimitiveTypeName.INT32, dateColMetadata.getPrimitiveType().getPrimitiveTypeName());
+    // assertEquals(LogicalTypeAnnotation.dateType(), dateColMetadata.getPrimitiveType().getLogicalTypeAnnotation());
+    //
+    // final ColumnChunkMetaData timeColMetadata = metadata.getBlocks().get(0).getColumns().get(1);
+    // assertTrue(timeColMetadata.toString().contains("someTimeColumn"));
+    // assertEquals(PrimitiveType.PrimitiveTypeName.INT64, timeColMetadata.getPrimitiveType().getPrimitiveTypeName());
+    // assertEquals(LogicalTypeAnnotation.timeType(true, LogicalTypeAnnotation.TimeUnit.NANOS),
+    // timeColMetadata.getPrimitiveType().getLogicalTypeAnnotation());
+    //
+    // final ColumnChunkMetaData localDateTimeColMetadata = metadata.getBlocks().get(0).getColumns().get(2);
+    // assertTrue(localDateTimeColMetadata.toString().contains("someLocalDateTimeColumn"));
+    // assertEquals(PrimitiveType.PrimitiveTypeName.INT64,
+    // localDateTimeColMetadata.getPrimitiveType().getPrimitiveTypeName());
+    // assertEquals(LogicalTypeAnnotation.timestampType(false, LogicalTypeAnnotation.TimeUnit.NANOS),
+    // localDateTimeColMetadata.getPrimitiveType().getLogicalTypeAnnotation());
+    //
+    // final ColumnChunkMetaData instantColMetadata = metadata.getBlocks().get(0).getColumns().get(3);
+    // assertTrue(instantColMetadata.toString().contains("someInstantColumn"));
+    // assertEquals(PrimitiveType.PrimitiveTypeName.INT64,
+    // instantColMetadata.getPrimitiveType().getPrimitiveTypeName());
+    // assertEquals(LogicalTypeAnnotation.timestampType(true, LogicalTypeAnnotation.TimeUnit.NANOS),
+    // instantColMetadata.getPrimitiveType().getLogicalTypeAnnotation());
+    // }
+    //
+    // /**
+    // * Test our manual verification techniques against a file generated by pyarrow. Here is the code to produce the
+    // * file when/if this file needs to be re-generated or changed.
+    // *
+    // *
+    // * ###############################################################################
+    // * import pyarrow.parquet
+    // *
+    // * pa_table = pyarrow.table({
+    // * 'int': [0, None, 100, -100],
+    // * 'float': [0.0, None, 100.0, -100.0],
+    // * 'string': ["aaa", None, "111", "ZZZ"],
+    // * 'intList': [
+    // * [0, None, 2],
+    // * None,
+    // * [3, 4, 6, 7, 8, 9, 10, 100, -100],
+    // * [5]
+    // * ],
+    // * 'floatList': [
+    // * [0.0, None, 2.0],
+    // * None,
+    // * [3.0, 4.0, 6.0, 7.0, 8.0, 9.0, 10.0, 100.0, -100.0],
+    // * [5.0]
+    // * ],
+    // * 'stringList': [
+    // * ["aaa", None, None],
+    // * None,
+    // * ["111", "zzz", "ZZZ", "AAA"],
+    // * ["ccc"]
+    // * ]})
+    // * pyarrow.parquet.write_table(pa_table, './extensions/parquet/table/src/test/resources/e0/pyarrow_stats.parquet')
+    // * ###############################################################################
+    // * 
+ // */ + // @Test + // public void verifyPyArrowStatistics() { + // final String path = ParquetTableReadWriteTest.class.getResource("/e0/pyarrow_stats.parquet").getFile(); + // final File pyarrowDest = new File(path); + // final Table pyarrowFromDisk = readParquetFileFromGitLFS(pyarrowDest); + // + // // Verify that our verification code works for a pyarrow generated table. + // assertTableStatistics(pyarrowFromDisk, pyarrowDest); + // + // // Write the table to disk using our code. + // final File dhDest = new File(rootFile, "ParquetTest_statistics_test.parquet"); + // writeTable(pyarrowFromDisk, dhDest); + // + // final Table dhFromDisk = checkSingleTable(pyarrowFromDisk, dhDest); + // + // // Run the verification code against DHC writer stats. + // assertTableStatistics(pyarrowFromDisk, dhDest); + // assertTableStatistics(dhFromDisk, dhDest); + // } + // + // @Test + // public void singleTable() { + // final File fooSource = new File(rootFile, "singleTable/foo.parquet"); + // final File fooBarSource = new File(rootFile, "singleTable/fooBar.parquet"); + // final File barSource = new File(rootFile, "singleTable/bar.parquet"); + // + // final Table foo; + // final Table fooBar; + // final Table bar; + // final Table fooBarNullFoo; + // final Table fooBarNullBar; + // + // final TableDefinition fooDefinition; + // final TableDefinition fooBarDefinition; + // final TableDefinition barDefinition; + // { + // fooSource.mkdirs(); + // fooBarSource.mkdirs(); + // barSource.mkdirs(); + // + // final ColumnHolder fooCol = intCol("Foo", 1, 2, 3); + // final ColumnHolder barCol = stringCol("Bar", "Zip", "Zap", "Zoom"); + // + // final ColumnHolder nullFooCol = + // intCol("Foo", QueryConstants.NULL_INT, QueryConstants.NULL_INT, QueryConstants.NULL_INT); + // final ColumnHolder nullBarCol = stringCol("Bar", null, null, null); + // + // final ColumnDefinition fooColDef = ColumnDefinition.ofInt("Foo"); + // final ColumnDefinition barColDef = ColumnDefinition.ofString("Bar"); + // + // fooDefinition = TableDefinition.of(fooColDef); + // fooBarDefinition = TableDefinition.of(fooColDef, barColDef); + // barDefinition = TableDefinition.of(barColDef); + // + // foo = newTable(fooDefinition, fooCol); + // fooBar = newTable(fooBarDefinition, fooCol, barCol); + // bar = newTable(barDefinition, barCol); + // + // fooBarNullFoo = newTable(fooBarDefinition, nullFooCol, barCol); + // fooBarNullBar = newTable(fooBarDefinition, fooCol, nullBarCol); + // + // writeTable(foo, fooSource); + // writeTable(fooBar, fooBarSource); + // writeTable(bar, barSource); + // } + // + // // Infer + // { + // checkSingleTable(foo, fooSource); + // checkSingleTable(fooBar, fooBarSource); + // checkSingleTable(bar, barSource); + // } + // + // // readTable inference to readSingleTable + // { + // assertTableEquals(foo, readTable(fooSource)); + // assertTableEquals(fooBar, readTable(fooBarSource)); + // assertTableEquals(bar, readTable(barSource)); + // } + // + // // Explicit + // { + // assertTableEquals(foo, readSingleFileTable(fooSource, EMPTY, fooDefinition)); + // assertTableEquals(fooBar, readSingleFileTable(fooBarSource, EMPTY, fooBarDefinition)); + // assertTableEquals(bar, readSingleFileTable(barSource, EMPTY, barDefinition)); + // } + // + // // Explicit subset + // { + // // fooBar as foo + // assertTableEquals(foo, readSingleFileTable(fooBarSource, EMPTY, fooDefinition)); + // // fooBar as bar + // assertTableEquals(bar, readSingleFileTable(fooBarSource, EMPTY, barDefinition)); + // } + // + // // Explicit superset + // 
{ + // // foo as fooBar + // assertTableEquals(fooBarNullBar, readSingleFileTable(fooSource, EMPTY, fooBarDefinition)); + // // bar as fooBar + // assertTableEquals(fooBarNullFoo, readSingleFileTable(barSource, EMPTY, fooBarDefinition)); + // } + // + // // No refreshing single table support + // { + // try { + // readSingleFileTable(fooSource, REFRESHING); + // fail("Expected IllegalArgumentException"); + // } catch (IllegalArgumentException e) { + // assertEquals("Unable to have a refreshing single parquet file", e.getMessage()); + // } + // + // try { + // readSingleFileTable(fooSource, REFRESHING, fooDefinition); + // fail("Expected IllegalArgumentException"); + // } catch (IllegalArgumentException e) { + // assertEquals("Unable to have a refreshing single parquet file", e.getMessage()); + // } + // } + // } + // + // @Test + // public void flatPartitionedTable() { + // // Create an empty parent directory + // final File source = new File(rootFile, "flatPartitionedTable/source"); + // final File emptySource = new File(rootFile, "flatPartitionedTable/emptySource"); + // + // final Table formerData; + // final Table latterData; + // final TableDefinition formerDefinition; + // final TableDefinition latterDefinition; + // final Runnable writeIntoEmptySource; + // { + // final File p1File = new File(source, "01.parquet"); + // final File p2File = new File(source, "02.parquet"); + // + // final File p1FileEmpty = new File(emptySource, "01.parquet"); + // final File p2FileEmpty = new File(emptySource, "02.parquet"); + // + // p1File.mkdirs(); + // p2File.mkdirs(); + // emptySource.mkdirs(); + // + // final ColumnHolder foo1 = intCol("Foo", 1, 2, 3); + // final ColumnHolder foo2 = intCol("Foo", 4, 5); + // + // final ColumnHolder bar1 = stringCol("Bar", null, null, null); + // final ColumnHolder bar2 = stringCol("Bar", "Zip", "Zap"); + // + // final Table p1 = newTable(foo1); + // final Table p2 = newTable(foo2, bar2); + // writeTable(p1, p1File); + // writeTable(p2, p2File); + // writeIntoEmptySource = () -> { + // p1FileEmpty.mkdirs(); + // p2FileEmpty.mkdirs(); + // writeTable(p1, p1FileEmpty); + // writeTable(p2, p2FileEmpty); + // }; + // + // final ColumnDefinition foo = ColumnDefinition.ofInt("Foo"); + // final ColumnDefinition bar = ColumnDefinition.ofString("Bar"); + // + // formerDefinition = TableDefinition.of(foo); + // latterDefinition = TableDefinition.of(foo, bar); + // + // formerData = merge( + // newTable(formerDefinition, foo1), + // newTable(formerDefinition, foo2)); + // latterData = merge( + // newTable(latterDefinition, foo1, bar1), + // newTable(latterDefinition, foo2, bar2)); + // } + // + // // Infer from last key + // { + // final Table table = readFlatPartitionedTable(source, EMPTY); + // assertTableEquals(latterData, table); + // } + // // Infer from last key, refreshing + // { + // final Table table = readFlatPartitionedTable(source, REFRESHING); + // assertTableEquals(latterData, table); + // } + // // readTable inference to readFlatPartitionedTable + // { + // assertTableEquals(latterData, readTable(source)); + // } + // + // // Explicit latter definition + // { + // final Table table = readFlatPartitionedTable(source, EMPTY, latterDefinition); + // assertTableEquals(latterData, table); + // } + // // Explicit latter definition, refreshing + // { + // final Table table = readFlatPartitionedTable(source, REFRESHING, latterDefinition); + // assertTableEquals(latterData, table); + // } + // + // // Explicit former definition + // { + // final Table table = 
readFlatPartitionedTable(source, EMPTY, formerDefinition); + // assertTableEquals(formerData, table); + // } + // // Explicit former definition, refreshing + // { + // final Table table = readFlatPartitionedTable(source, REFRESHING, formerDefinition); + // assertTableEquals(formerData, table); + // } + // + // // Explicit definition, empty directory + // { + // final Table table = readFlatPartitionedTable(emptySource, EMPTY, latterDefinition); + // assertTableEquals(TableTools.newTable(latterDefinition), table); + // } + // // Explicit definition, empty directory, refreshing with new data added + // { + // final Table table = readFlatPartitionedTable(emptySource, REFRESHING, latterDefinition); + // assertTableEquals(TableTools.newTable(latterDefinition), table); + // + // writeIntoEmptySource.run(); + // ExecutionContext.getContext().getUpdateGraph().cast().runWithinUnitTestCycle(() -> { + // // This is not generally a good way to do this sort of testing. Ideally, we'd be a bit smarter and use + // // a test-driven TableDataRefreshService.getSharedRefreshService. + // ((SourceTable) table).tableLocationProvider().refresh(); + // ((SourceTable) table).refresh(); + // assertTableEquals(latterData, table); + // }); + // } + // } + // + // @Test + // public void keyValuePartitionedTable() { + // final File source = new File(rootFile, "keyValuePartitionedTable/source"); + // final File emptySource = new File(rootFile, "keyValuePartitionedTable/emptySource"); + // + // final Table formerData; + // final Table latterData; + // final TableDefinition formerDefinition; + // final TableDefinition latterDefinition; + // final Runnable writeIntoEmptySource; + // { + // final File p1File = new File(source, "Partition=1/z.parquet"); + // final File p2File = new File(source, "Partition=2/a.parquet"); + // + // final File p1FileEmpty = new File(emptySource, "Partition=1/z.parquet"); + // final File p2FileEmpty = new File(emptySource, "Partition=2/a.parquet"); + // + // p1File.mkdirs(); + // p2File.mkdirs(); + // emptySource.mkdirs(); + // + // final ColumnHolder part1 = intCol("Partition", 1, 1, 1); + // final ColumnHolder part2 = intCol("Partition", 2, 2); + // + // final ColumnHolder foo1 = intCol("Foo", 1, 2, 3); + // final ColumnHolder foo2 = intCol("Foo", 4, 5); + // + // final ColumnHolder bar1 = stringCol("Bar", null, null, null); + // final ColumnHolder bar2 = stringCol("Bar", "Zip", "Zap"); + // + // final Table p1 = newTable(foo1); + // final Table p2 = newTable(foo2, bar2); + // writeTable(p1, p1File); + // writeTable(p2, p2File); + // writeIntoEmptySource = () -> { + // p1FileEmpty.mkdirs(); + // p2FileEmpty.mkdirs(); + // writeTable(p1, p1FileEmpty); + // writeTable(p2, p2FileEmpty); + // }; + // + // // Need to be explicit w/ definition so partitioning column applied to expected tables + // final ColumnDefinition partition = ColumnDefinition.ofInt("Partition").withPartitioning(); + // final ColumnDefinition foo = ColumnDefinition.ofInt("Foo"); + // final ColumnDefinition bar = ColumnDefinition.ofString("Bar"); + // + // // Note: merge does not preserve partition column designation, so we need to explicitly create them + // formerDefinition = TableDefinition.of(partition, foo); + // latterDefinition = TableDefinition.of(partition, foo, bar); + // + // formerData = merge( + // newTable(formerDefinition, part1, foo1), + // newTable(formerDefinition, part2, foo2)); + // latterData = merge( + // newTable(latterDefinition, part1, foo1, bar1), + // newTable(latterDefinition, part2, foo2, bar2)); + 
// } + // + // // Infer from last key + // { + // final Table table = readKeyValuePartitionedTable(source, EMPTY); + // assertTableEquals(latterData, table); + // } + // // Infer from last key, refreshing + // { + // final Table table = readKeyValuePartitionedTable(source, REFRESHING); + // assertTableEquals(latterData, table); + // } + // // readTable inference readKeyValuePartitionedTable + // { + // assertTableEquals(latterData, readTable(source)); + // } + // + // // Explicit latter definition + // { + // final Table table = readKeyValuePartitionedTable(source, EMPTY, latterDefinition); + // assertTableEquals(latterData, table); + // } + // // Explicit latter definition, refreshing + // { + // final Table table = readKeyValuePartitionedTable(source, REFRESHING, latterDefinition); + // assertTableEquals(latterData, table); + // } + // + // // Explicit former definition + // { + // final Table table = readKeyValuePartitionedTable(source, EMPTY, formerDefinition); + // assertTableEquals(formerData, table); + // } + // // Explicit former definition, refreshing + // { + // final Table table = readKeyValuePartitionedTable(source, REFRESHING, formerDefinition); + // assertTableEquals(formerData, table); + // } + // + // // Explicit definition, empty directory + // { + // final Table table = readKeyValuePartitionedTable(emptySource, EMPTY, latterDefinition); + // assertTableEquals(TableTools.newTable(latterDefinition), table); + // } + // // Explicit definition, empty directory, refreshing with new data added + // { + // final Table table = readKeyValuePartitionedTable(emptySource, REFRESHING, latterDefinition); + // assertTableEquals(TableTools.newTable(latterDefinition), table); + // + // writeIntoEmptySource.run(); + // ExecutionContext.getContext().getUpdateGraph().cast().runWithinUnitTestCycle(() -> { + // // This is not generally a good way to do this sort of testing. Ideally, we'd be a bit smarter and use + // // a test-driven TableDataRefreshService.getSharedRefreshService. 
+ // ((SourceTable) table).tableLocationProvider().refresh(); + // ((SourceTable) table).refresh(); + // assertTableEquals(latterData, table); + // }); + // } + // } + // + // @Test + // public void readSingleColumn() { + // final File file = new File(rootFile, "readSingleColumn.parquet"); + // final Table primitives = newTable( + // booleanCol("Bool", null, true), + // charCol("Char", NULL_CHAR, (char) 42), + // byteCol("Byte", NULL_BYTE, (byte) 42), + // shortCol("Short", NULL_SHORT, (short) 42), + // intCol("Int", NULL_INT, 42), + // longCol("Long", NULL_LONG, 42L), + // floatCol("Float", NULL_FLOAT, 42.0f), + // doubleCol("Double", NULL_DOUBLE, 42.0), + // stringCol("String", null, "42"), + // instantCol("Instant", null, Instant.ofEpochMilli(42))); + // { + // writeTable(primitives, file); + // } + // assertTableEquals( + // primitives.view("Bool"), + // readSingleFileTable(file, EMPTY, TableDefinition.of(ColumnDefinition.ofBoolean("Bool")))); + // assertTableEquals( + // primitives.view("Char"), + // readSingleFileTable(file, EMPTY, TableDefinition.of(ColumnDefinition.ofChar("Char")))); + // assertTableEquals( + // primitives.view("Byte"), + // readSingleFileTable(file, EMPTY, TableDefinition.of(ColumnDefinition.ofByte("Byte")))); + // assertTableEquals( + // primitives.view("Short"), + // readSingleFileTable(file, EMPTY, TableDefinition.of(ColumnDefinition.ofShort("Short")))); + // assertTableEquals( + // primitives.view("Int"), + // readSingleFileTable(file, EMPTY, TableDefinition.of(ColumnDefinition.ofInt("Int")))); + // assertTableEquals( + // primitives.view("Long"), + // readSingleFileTable(file, EMPTY, TableDefinition.of(ColumnDefinition.ofLong("Long")))); + // assertTableEquals( + // primitives.view("Float"), + // readSingleFileTable(file, EMPTY, TableDefinition.of(ColumnDefinition.ofFloat("Float")))); + // assertTableEquals( + // primitives.view("Double"), + // readSingleFileTable(file, EMPTY, TableDefinition.of(ColumnDefinition.ofDouble("Double")))); + // assertTableEquals( + // primitives.view("String"), + // readSingleFileTable(file, EMPTY, TableDefinition.of(ColumnDefinition.ofString("String")))); + // assertTableEquals( + // primitives.view("Instant"), + // readSingleFileTable(file, EMPTY, TableDefinition.of(ColumnDefinition.ofTime("Instant")))); + // } + // + // private void assertTableStatistics(Table inputTable, File dest) { + // // Verify that the columns have the correct statistics. 
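+    // // (The checks below read the file's ParquetMetadata once, then walk the table's columns in
+    // // definition order and dispatch on each column's type to a type-specific helper that
+    // // recomputes the item count, null count, and min/max from the Deephaven column source and
+    // // compares them against the corresponding Parquet column-chunk Statistics.)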
+ // final ParquetMetadata metadata = + // new ParquetTableLocationKey(dest, 0, null, ParquetInstructions.EMPTY).getMetadata(); + // + // final String[] colNames = inputTable.getDefinition().getColumnNamesArray(); + // for (int colIdx = 0; colIdx < inputTable.numColumns(); ++colIdx) { + // final String colName = colNames[colIdx]; + // + // final ColumnSource columnSource = inputTable.getColumnSource(colName); + // final ColumnChunkMetaData columnChunkMetaData = metadata.getBlocks().get(0).getColumns().get(colIdx); + // final Statistics statistics = columnChunkMetaData.getStatistics(); + // + // final Class csType = columnSource.getType(); + // + // if (csType == boolean.class || csType == Boolean.class) { + // assertBooleanColumnStatistics( + // new SerialByteColumnIterator( + // ReinterpretUtils.booleanToByteSource((ColumnSource) columnSource), + // inputTable.getRowSet()), + // (Statistics) statistics); + // } else if (csType == Boolean[].class) { + // assertBooleanArrayColumnStatistics( + // new SerialObjectColumnIterator<>( + // (ColumnSource) columnSource, + // inputTable.getRowSet()), + // (Statistics) statistics); + // } else if (csType == byte.class || csType == Byte.class) { + // assertByteColumnStatistics( + // new SerialByteColumnIterator( + // (ColumnSource) columnSource, inputTable.getRowSet()), + // (Statistics) statistics); + // } else if (csType == byte[].class) { + // assertByteArrayColumnStatistics( + // new SerialObjectColumnIterator<>( + // (ColumnSource) columnSource, + // inputTable.getRowSet()), + // (Statistics) statistics); + // } else if (csType == ByteVector.class) { + // assertByteVectorColumnStatistics( + // new SerialObjectColumnIterator<>( + // (ColumnSource) columnSource, + // inputTable.getRowSet()), + // (Statistics) statistics); + // } else if (csType == char.class || csType == Character.class) { + // assertCharColumnStatistics( + // new SerialCharacterColumnIterator( + // (ColumnSource) columnSource, inputTable.getRowSet()), + // (Statistics) statistics); + // } else if (csType == char[].class) { + // assertCharArrayColumnStatistics( + // new SerialObjectColumnIterator<>( + // (ColumnSource) columnSource, + // inputTable.getRowSet()), + // (Statistics) statistics); + // } else if (csType == CharVector.class) { + // assertCharVectorColumnStatistics( + // new SerialObjectColumnIterator<>( + // (ColumnSource) columnSource, + // inputTable.getRowSet()), + // (Statistics) statistics); + // } else if (csType == short.class || csType == Short.class) { + // assertShortColumnStatistics( + // new SerialShortColumnIterator( + // (ColumnSource) columnSource, inputTable.getRowSet()), + // (Statistics) statistics); + // } else if (csType == short[].class) { + // assertShortArrayColumnStatistics( + // new SerialObjectColumnIterator<>( + // (ColumnSource) columnSource, + // inputTable.getRowSet()), + // (Statistics) statistics); + // } else if (csType == ShortVector.class) { + // assertShortVectorColumnStatistics( + // new SerialObjectColumnIterator<>( + // (ColumnSource) columnSource, + // inputTable.getRowSet()), + // (Statistics) statistics); + // } else if (csType == int.class || csType == Integer.class) { + // assertIntColumnStatistics( + // new SerialIntegerColumnIterator( + // (ColumnSource) columnSource, inputTable.getRowSet()), + // (Statistics) statistics); + // } else if (csType == int[].class) { + // assertIntArrayColumnStatistics( + // new SerialObjectColumnIterator<>( + // (ColumnSource) columnSource, + // inputTable.getRowSet()), + // (Statistics) 
statistics); + // } else if (csType == IntVector.class) { + // assertIntVectorColumnStatistics( + // new SerialObjectColumnIterator<>( + // (ColumnSource) columnSource, + // inputTable.getRowSet()), + // (Statistics) statistics); + // } else if (csType == long.class || csType == Long.class) { + // assertLongColumnStatistics( + // new SerialLongColumnIterator( + // (ColumnSource) columnSource, inputTable.getRowSet()), + // (Statistics) statistics); + // } else if (csType == long[].class) { + // assertLongArrayColumnStatistics( + // new SerialObjectColumnIterator<>( + // (ColumnSource) columnSource, + // inputTable.getRowSet()), + // (Statistics) statistics); + // } else if (csType == LongVector.class) { + // assertLongVectorColumnStatistics( + // new SerialObjectColumnIterator<>( + // (ColumnSource) columnSource, + // inputTable.getRowSet()), + // (Statistics) statistics); + // } else if (csType == float.class || csType == Float.class) { + // assertFloatColumnStatistics( + // new SerialFloatColumnIterator( + // (ColumnSource) columnSource, inputTable.getRowSet()), + // (Statistics) statistics); + // } else if (csType == float[].class) { + // assertFloatArrayColumnStatistics( + // new SerialObjectColumnIterator<>( + // (ColumnSource) columnSource, + // inputTable.getRowSet()), + // (Statistics) statistics); + // } else if (csType == FloatVector.class) { + // assertFloatVectorColumnStatistics( + // new SerialObjectColumnIterator<>( + // (ColumnSource) columnSource, + // inputTable.getRowSet()), + // (Statistics) statistics); + // } else if (csType == double.class || csType == Double.class) { + // assertDoubleColumnStatistics( + // new SerialDoubleColumnIterator( + // (ColumnSource) columnSource, inputTable.getRowSet()), + // (Statistics) statistics); + // } else if (csType == double[].class) { + // assertDoubleArrayColumnStatistics( + // new SerialObjectColumnIterator<>( + // (ColumnSource) columnSource, + // inputTable.getRowSet()), + // (Statistics) statistics); + // } else if (csType == DoubleVector.class) { + // assertDoubleVectorColumnStatistics( + // new SerialObjectColumnIterator<>( + // (ColumnSource) columnSource, + // inputTable.getRowSet()), + // (Statistics) statistics); + // } else if (csType == String.class) { + // assertStringColumnStatistics( + // new SerialObjectColumnIterator<>( + // (ColumnSource) columnSource, inputTable.getRowSet()), + // (Statistics) statistics); + // } else if (csType == String[].class) { + // assertStringArrayColumnStatistics( + // new SerialObjectColumnIterator<>( + // (ColumnSource) columnSource, + // inputTable.getRowSet()), + // (Statistics) statistics); + // } else if (csType == ObjectVector.class && columnSource.getComponentType() == String.class) { + // assertStringVectorColumnStatistics( + // new SerialObjectColumnIterator<>( + // (ColumnSource>) columnSource, + // inputTable.getRowSet()), + // (Statistics) statistics); + // } else if (csType == BigInteger.class) { + // assertBigIntegerColumnStatistics( + // new SerialObjectColumnIterator( + // (ColumnSource) columnSource, inputTable.getRowSet()), + // (Statistics) statistics); + // } else if (csType == BigDecimal.class) { + // assertBigDecimalColumnStatistics( + // new SerialObjectColumnIterator<>( + // (ColumnSource) columnSource, inputTable.getRowSet()), + // (Statistics) statistics); + // } else if (csType == Instant.class) { + // assertInstantColumnStatistic( + // new SerialObjectColumnIterator<>( + // (ColumnSource) columnSource, inputTable.getRowSet()), + // (Statistics) statistics); + 
// } else if (csType == Instant[].class) { + // assertInstantArrayColumnStatistics( + // new SerialObjectColumnIterator<>( + // (ColumnSource) columnSource, + // inputTable.getRowSet()), + // (Statistics) statistics); + // } else if (csType == ObjectVector.class && columnSource.getComponentType() == Instant.class) { + // assertInstantVectorColumnStatistics( + // new SerialObjectColumnIterator<>( + // (ColumnSource>) columnSource, + // inputTable.getRowSet()), + // (Statistics) statistics); + // } else { + // // We can't verify statistics for this column type, so just skip it. + // System.out.println("Ignoring column " + colName + " of type " + csType.getName()); + // } + // } + // } + // + // // region Column Statistics Assertions + // private void assertBooleanColumnStatistics(SerialByteColumnIterator iterator, Statistics statistics) { + // MutableLong itemCount = new MutableLong(0); + // MutableLong nullCount = new MutableLong(0); + // MutableInt min = new MutableInt(NULL_BYTE); + // MutableInt max = new MutableInt(NULL_BYTE); + // + // iterator.forEachRemaining((ByteConsumer) value -> { + // itemCount.increment(); + // if (value == NULL_BYTE) { + // nullCount.increment(); + // } else { + // if (min.getValue() == NULL_BYTE || value < min.getValue()) { + // min.setValue(value); + // } + // if (max.getValue() == NULL_BYTE || value > max.getValue()) { + // max.setValue(value); + // } + // } + // }); + // + // assertEquals(nullCount.intValue(), statistics.getNumNulls()); + // if (!itemCount.getValue().equals(nullCount.getValue())) { + // // There are some non-null values, so min and max should be non-null and equal to observed values. + // assertEquals(min.getValue() == 1, statistics.genericGetMin()); + // assertEquals(max.getValue() == 1, statistics.genericGetMax()); + // } else { + // // Everything is null, statistics should be empty. + // assertFalse(statistics.hasNonNullValue()); + // } + // } + // + // private void assertBooleanArrayColumnStatistics(SerialObjectColumnIterator iterator, + // Statistics statistics) { + // MutableLong itemCount = new MutableLong(0); + // MutableLong nullCount = new MutableLong(0); + // MutableInt min = new MutableInt(NULL_BYTE); + // MutableInt max = new MutableInt(NULL_BYTE); + // + // iterator.forEachRemaining(values -> { + // if (values == null) { + // itemCount.increment(); + // nullCount.increment(); + // return; + // } + // for (final Boolean value : values) { + // itemCount.increment(); + // if (value == null) { + // nullCount.increment(); + // } else { + // if (min.getValue() == NULL_BYTE || (value ? 1 : 0) < min.getValue()) { + // min.setValue(value ? 1 : 0); + // } + // if (max.getValue() == NULL_BYTE || (value ? 1 : 0) > max.getValue()) { + // max.setValue(value ? 1 : 0); + // } + // } + // } + // }); + // + // assertEquals(nullCount.intValue(), statistics.getNumNulls()); + // if (!itemCount.getValue().equals(nullCount.getValue())) { + // // There are some non-null values, so min and max should be non-null and equal to observed values. + // assertEquals(min.getValue() == 1, statistics.genericGetMin()); + // assertEquals(max.getValue() == 1, statistics.genericGetMax()); + // } else { + // // Everything is null, statistics should be empty. 
+ // assertFalse(statistics.hasNonNullValue()); + // } + // } + // + // private void assertByteColumnStatistics(SerialByteColumnIterator iterator, Statistics statistics) { + // MutableLong itemCount = new MutableLong(0); + // MutableLong nullCount = new MutableLong(0); + // MutableInt min = new MutableInt(NULL_BYTE); + // MutableInt max = new MutableInt(NULL_BYTE); + // + // iterator.forEachRemaining((ByteConsumer) value -> { + // itemCount.increment(); + // if (value == NULL_BYTE) { + // nullCount.increment(); + // } else { + // if (min.getValue() == NULL_BYTE || value < min.getValue()) { + // min.setValue(value); + // } + // if (max.getValue() == NULL_BYTE || value > max.getValue()) { + // max.setValue(value); + // } + // } + // }); + // + // assertEquals(nullCount.intValue(), statistics.getNumNulls()); + // if (!itemCount.getValue().equals(nullCount.getValue())) { + // // There are some non-null values, so min and max should be non-null and equal to observed values. + // assertEquals(min.getValue(), statistics.genericGetMin()); + // assertEquals(max.getValue(), statistics.genericGetMax()); + // } else { + // // Everything is null, statistics should be empty. + // assertFalse(statistics.hasNonNullValue()); + // } + // } + // + // private void assertByteArrayColumnStatistics(SerialObjectColumnIterator iterator, + // Statistics statistics) { + // MutableLong itemCount = new MutableLong(0); + // MutableLong nullCount = new MutableLong(0); + // MutableInt min = new MutableInt(NULL_BYTE); + // MutableInt max = new MutableInt(NULL_BYTE); + // + // iterator.forEachRemaining(values -> { + // if (values == null) { + // itemCount.increment(); + // nullCount.increment(); + // return; + // } + // for (final byte value : values) { + // itemCount.increment(); + // if (value == NULL_BYTE) { + // nullCount.increment(); + // } else { + // if (min.getValue() == NULL_BYTE || value < min.getValue()) { + // min.setValue(value); + // } + // if (max.getValue() == NULL_BYTE || value > max.getValue()) { + // max.setValue(value); + // } + // } + // } + // }); + // + // assertEquals(nullCount.intValue(), statistics.getNumNulls()); + // if (!itemCount.getValue().equals(nullCount.getValue())) { + // // There are some non-null values, so min and max should be non-null and equal to observed values. + // assertEquals(min.getValue(), statistics.genericGetMin()); + // assertEquals(max.getValue(), statistics.genericGetMax()); + // } else { + // // Everything is null, statistics should be empty. 
+ // assertFalse(statistics.hasNonNullValue()); + // } + // } + // + // private void assertByteVectorColumnStatistics(SerialObjectColumnIterator iterator, + // Statistics statistics) { + // MutableLong itemCount = new MutableLong(0); + // MutableLong nullCount = new MutableLong(0); + // MutableInt min = new MutableInt(NULL_BYTE); + // MutableInt max = new MutableInt(NULL_BYTE); + // + // iterator.forEachRemaining(values -> { + // if (values == null) { + // itemCount.increment(); + // nullCount.increment(); + // return; + // } + // for (final byte value : values) { + // itemCount.increment(); + // if (value == NULL_BYTE) { + // nullCount.increment(); + // } else { + // if (min.getValue() == NULL_BYTE || value < min.getValue()) { + // min.setValue(value); + // } + // if (max.getValue() == NULL_BYTE || value > max.getValue()) { + // max.setValue(value); + // } + // } + // } + // }); + // + // assertEquals(nullCount.intValue(), statistics.getNumNulls()); + // if (!itemCount.getValue().equals(nullCount.getValue())) { + // // There are some non-null values, so min and max should be non-null and equal to observed values. + // assertEquals(min.getValue(), statistics.genericGetMin()); + // assertEquals(max.getValue(), statistics.genericGetMax()); + // } else { + // // Everything is null, statistics should be empty. + // assertFalse(statistics.hasNonNullValue()); + // } + // } + // + // private void assertCharColumnStatistics(SerialCharacterColumnIterator iterator, Statistics statistics) { + // MutableLong itemCount = new MutableLong(0); + // MutableLong nullCount = new MutableLong(0); + // MutableInt min = new MutableInt(NULL_CHAR); + // MutableInt max = new MutableInt(NULL_CHAR); + // + // iterator.forEachRemaining((CharConsumer) value -> { + // itemCount.increment(); + // if (value == NULL_CHAR) { + // nullCount.increment(); + // } else { + // if (min.getValue() == NULL_CHAR || value < min.getValue()) { + // min.setValue(value); + // } + // if (max.getValue() == NULL_CHAR || value > max.getValue()) { + // max.setValue(value); + // } + // } + // }); + // + // assertEquals(nullCount.intValue(), statistics.getNumNulls()); + // if (!itemCount.getValue().equals(nullCount.getValue())) { + // // There are some non-null values, so min and max should be non-null and equal to observed values. + // assertEquals(min.getValue(), statistics.genericGetMin()); + // assertEquals(max.getValue(), statistics.genericGetMax()); + // } else { + // // Everything is null, statistics should be empty. 
+ // assertFalse(statistics.hasNonNullValue()); + // } + // } + // + // private void assertCharArrayColumnStatistics(SerialObjectColumnIterator iterator, + // Statistics statistics) { + // MutableLong itemCount = new MutableLong(0); + // MutableLong nullCount = new MutableLong(0); + // MutableInt min = new MutableInt(NULL_CHAR); + // MutableInt max = new MutableInt(NULL_CHAR); + // + // iterator.forEachRemaining(values -> { + // if (values == null) { + // itemCount.increment(); + // nullCount.increment(); + // return; + // } + // for (final char value : values) { + // itemCount.increment(); + // if (value == NULL_CHAR) { + // nullCount.increment(); + // } else { + // if (min.getValue() == NULL_CHAR || value < min.getValue()) { + // min.setValue(value); + // } + // if (max.getValue() == NULL_CHAR || value > max.getValue()) { + // max.setValue(value); + // } + // } + // } + // }); + // + // assertEquals(nullCount.intValue(), statistics.getNumNulls()); + // if (!itemCount.getValue().equals(nullCount.getValue())) { + // // There are some non-null values, so min and max should be non-null and equal to observed values. + // assertEquals(min.getValue(), statistics.genericGetMin()); + // assertEquals(max.getValue(), statistics.genericGetMax()); + // } else { + // // Everything is null, statistics should be empty. + // assertFalse(statistics.hasNonNullValue()); + // } + // } + // + // private void assertCharVectorColumnStatistics(SerialObjectColumnIterator iterator, + // Statistics statistics) { + // MutableLong itemCount = new MutableLong(0); + // MutableLong nullCount = new MutableLong(0); + // MutableInt min = new MutableInt(NULL_CHAR); + // MutableInt max = new MutableInt(NULL_CHAR); + // + // iterator.forEachRemaining(values -> { + // if (values == null) { + // itemCount.increment(); + // nullCount.increment(); + // return; + // } + // for (final char value : values) { + // itemCount.increment(); + // if (value == NULL_CHAR) { + // nullCount.increment(); + // } else { + // if (min.getValue() == NULL_CHAR || value < min.getValue()) { + // min.setValue(value); + // } + // if (max.getValue() == NULL_CHAR || value > max.getValue()) { + // max.setValue(value); + // } + // } + // } + // }); + // + // assertEquals(nullCount.intValue(), statistics.getNumNulls()); + // if (!itemCount.getValue().equals(nullCount.getValue())) { + // // There are some non-null values, so min and max should be non-null and equal to observed values. + // assertEquals(min.getValue(), statistics.genericGetMin()); + // assertEquals(max.getValue(), statistics.genericGetMax()); + // } else { + // // Everything is null, statistics should be empty. 
+ // assertFalse(statistics.hasNonNullValue()); + // } + // } + // + // private void assertShortColumnStatistics(SerialShortColumnIterator iterator, Statistics statistics) { + // MutableLong itemCount = new MutableLong(0); + // MutableLong nullCount = new MutableLong(0); + // MutableInt min = new MutableInt(NULL_SHORT); + // MutableInt max = new MutableInt(NULL_SHORT); + // + // iterator.forEachRemaining((ShortConsumer) value -> { + // itemCount.increment(); + // if (value == NULL_SHORT) { + // nullCount.increment(); + // } else { + // if (min.getValue() == NULL_SHORT || value < min.getValue()) { + // min.setValue(value); + // } + // if (max.getValue() == NULL_SHORT || value > max.getValue()) { + // max.setValue(value); + // } + // } + // }); + // + // assertEquals(nullCount.intValue(), statistics.getNumNulls()); + // if (!itemCount.getValue().equals(nullCount.getValue())) { + // // There are some non-null values, so min and max should be non-null and equal to observed values. + // assertEquals(min.getValue(), statistics.genericGetMin()); + // assertEquals(max.getValue(), statistics.genericGetMax()); + // } else { + // // Everything is null, statistics should be empty. + // assertFalse(statistics.hasNonNullValue()); + // } + // } + // + // private void assertShortArrayColumnStatistics(SerialObjectColumnIterator iterator, + // Statistics statistics) { + // MutableLong itemCount = new MutableLong(0); + // MutableLong nullCount = new MutableLong(0); + // MutableInt min = new MutableInt(NULL_SHORT); + // MutableInt max = new MutableInt(NULL_SHORT); + // + // iterator.forEachRemaining(values -> { + // if (values == null) { + // itemCount.increment(); + // nullCount.increment(); + // return; + // } + // for (final short value : values) { + // itemCount.increment(); + // if (value == NULL_SHORT) { + // nullCount.increment(); + // } else { + // if (min.getValue() == NULL_SHORT || value < min.getValue()) { + // min.setValue(value); + // } + // if (max.getValue() == NULL_SHORT || value > max.getValue()) { + // max.setValue(value); + // } + // } + // } + // }); + // + // assertEquals(nullCount.intValue(), statistics.getNumNulls()); + // if (!itemCount.getValue().equals(nullCount.getValue())) { + // // There are some non-null values, so min and max should be non-null and equal to observed values. + // assertEquals(min.getValue(), statistics.genericGetMin()); + // assertEquals(max.getValue(), statistics.genericGetMax()); + // } else { + // // Everything is null, statistics should be empty. 
+ // assertFalse(statistics.hasNonNullValue()); + // } + // } + // + // private void assertShortVectorColumnStatistics(SerialObjectColumnIterator iterator, + // Statistics statistics) { + // MutableLong itemCount = new MutableLong(0); + // MutableLong nullCount = new MutableLong(0); + // MutableInt min = new MutableInt(NULL_SHORT); + // MutableInt max = new MutableInt(NULL_SHORT); + // + // iterator.forEachRemaining(values -> { + // if (values == null) { + // itemCount.increment(); + // nullCount.increment(); + // return; + // } + // for (final short value : values) { + // itemCount.increment(); + // if (value == NULL_SHORT) { + // nullCount.increment(); + // } else { + // if (min.getValue() == NULL_SHORT || value < min.getValue()) { + // min.setValue(value); + // } + // if (max.getValue() == NULL_SHORT || value > max.getValue()) { + // max.setValue(value); + // } + // } + // } + // }); + // + // assertEquals(nullCount.intValue(), statistics.getNumNulls()); + // if (!itemCount.getValue().equals(nullCount.getValue())) { + // // There are some non-null values, so min and max should be non-null and equal to observed values. + // assertEquals(min.getValue(), statistics.genericGetMin()); + // assertEquals(max.getValue(), statistics.genericGetMax()); + // } else { + // // Everything is null, statistics should be empty. + // assertFalse(statistics.hasNonNullValue()); + // } + // } + // + // private void assertIntColumnStatistics(SerialIntegerColumnIterator iterator, Statistics statistics) { + // MutableLong itemCount = new MutableLong(0); + // MutableLong nullCount = new MutableLong(0); + // MutableInt min = new MutableInt(NULL_INT); + // MutableInt max = new MutableInt(NULL_INT); + // + // iterator.forEachRemaining((IntConsumer) value -> { + // itemCount.increment(); + // if (value == NULL_INT) { + // nullCount.increment(); + // } else { + // if (min.getValue() == NULL_INT || value < min.getValue()) { + // min.setValue(value); + // } + // if (max.getValue() == NULL_INT || value > max.getValue()) { + // max.setValue(value); + // } + // } + // }); + // + // assertEquals(nullCount.intValue(), statistics.getNumNulls()); + // if (!itemCount.getValue().equals(nullCount.getValue())) { + // // There are some non-null values, so min and max should be non-null and equal to observed values. + // assertEquals(min.getValue(), statistics.genericGetMin()); + // assertEquals(max.getValue(), statistics.genericGetMax()); + // } else { + // // Everything is null, statistics should be empty. 
+ // assertFalse(statistics.hasNonNullValue()); + // } + // } + // + // private void assertIntArrayColumnStatistics(SerialObjectColumnIterator iterator, + // Statistics statistics) { + // MutableLong itemCount = new MutableLong(0); + // MutableLong nullCount = new MutableLong(0); + // MutableInt min = new MutableInt(NULL_INT); + // MutableInt max = new MutableInt(NULL_INT); + // + // iterator.forEachRemaining(values -> { + // if (values == null) { + // itemCount.increment(); + // nullCount.increment(); + // return; + // } + // for (final int value : values) { + // itemCount.increment(); + // if (value == NULL_INT) { + // nullCount.increment(); + // } else { + // if (min.getValue() == NULL_INT || value < min.getValue()) { + // min.setValue(value); + // } + // if (max.getValue() == NULL_INT || value > max.getValue()) { + // max.setValue(value); + // } + // } + // } + // }); + // + // assertEquals(nullCount.intValue(), statistics.getNumNulls()); + // if (!itemCount.getValue().equals(nullCount.getValue())) { + // // There are some non-null values, so min and max should be non-null and equal to observed values. + // assertEquals(min.getValue(), statistics.genericGetMin()); + // assertEquals(max.getValue(), statistics.genericGetMax()); + // } else { + // // Everything is null, statistics should be empty. + // assertFalse(statistics.hasNonNullValue()); + // } + // } + // + // private void assertIntVectorColumnStatistics(SerialObjectColumnIterator iterator, + // Statistics statistics) { + // MutableLong itemCount = new MutableLong(0); + // MutableLong nullCount = new MutableLong(0); + // MutableInt min = new MutableInt(NULL_INT); + // MutableInt max = new MutableInt(NULL_INT); + // + // iterator.forEachRemaining(values -> { + // if (values == null) { + // itemCount.increment(); + // nullCount.increment(); + // return; + // } + // for (final int value : values) { + // itemCount.increment(); + // if (value == NULL_INT) { + // nullCount.increment(); + // } else { + // if (min.getValue() == NULL_INT || value < min.getValue()) { + // min.setValue(value); + // } + // if (max.getValue() == NULL_INT || value > max.getValue()) { + // max.setValue(value); + // } + // } + // } + // }); + // + // assertEquals(nullCount.intValue(), statistics.getNumNulls()); + // if (!itemCount.getValue().equals(nullCount.getValue())) { + // // There are some non-null values, so min and max should be non-null and equal to observed values. + // assertEquals(min.getValue(), statistics.genericGetMin()); + // assertEquals(max.getValue(), statistics.genericGetMax()); + // } else { + // // Everything is null, statistics should be empty. 
+ // assertFalse(statistics.hasNonNullValue()); + // } + // } + // + // private void assertLongColumnStatistics(SerialLongColumnIterator iterator, Statistics statistics) { + // MutableLong itemCount = new MutableLong(0); + // MutableLong nullCount = new MutableLong(0); + // MutableLong min = new MutableLong(NULL_LONG); + // MutableLong max = new MutableLong(NULL_LONG); + // + // iterator.forEachRemaining((LongConsumer) value -> { + // itemCount.increment(); + // if (value == NULL_LONG) { + // nullCount.increment(); + // } else { + // if (min.getValue() == NULL_LONG || value < min.getValue()) { + // min.setValue(value); + // } + // if (max.getValue() == NULL_LONG || value > max.getValue()) { + // max.setValue(value); + // } + // } + // }); + // + // assertEquals(nullCount.intValue(), statistics.getNumNulls()); + // if (!itemCount.getValue().equals(nullCount.getValue())) { + // // There are some non-null values, so min and max should be non-null and equal to observed values. + // assertEquals(min.getValue(), statistics.genericGetMin()); + // assertEquals(max.getValue(), statistics.genericGetMax()); + // } else { + // // Everything is null, statistics should be empty. + // assertFalse(statistics.hasNonNullValue()); + // } + // } + // + // private void assertLongArrayColumnStatistics(SerialObjectColumnIterator iterator, + // Statistics statistics) { + // MutableLong itemCount = new MutableLong(0); + // MutableLong nullCount = new MutableLong(0); + // MutableLong min = new MutableLong(NULL_LONG); + // MutableLong max = new MutableLong(NULL_LONG); + // + // iterator.forEachRemaining(values -> { + // if (values == null) { + // itemCount.increment(); + // nullCount.increment(); + // return; + // } + // for (final long value : values) { + // itemCount.increment(); + // if (value == NULL_LONG) { + // nullCount.increment(); + // } else { + // if (min.getValue() == NULL_LONG || value < min.getValue()) { + // min.setValue(value); + // } + // if (max.getValue() == NULL_LONG || value > max.getValue()) { + // max.setValue(value); + // } + // } + // } + // }); + // + // assertEquals(nullCount.intValue(), statistics.getNumNulls()); + // if (!itemCount.getValue().equals(nullCount.getValue())) { + // // There are some non-null values, so min and max should be non-null and equal to observed values. + // assertEquals(min.getValue(), statistics.genericGetMin()); + // assertEquals(max.getValue(), statistics.genericGetMax()); + // } else { + // // Everything is null, statistics should be empty. 
+ // assertFalse(statistics.hasNonNullValue()); + // } + // } + // + // private void assertLongVectorColumnStatistics(SerialObjectColumnIterator iterator, + // Statistics statistics) { + // MutableLong itemCount = new MutableLong(0); + // MutableLong nullCount = new MutableLong(0); + // MutableLong min = new MutableLong(NULL_LONG); + // MutableLong max = new MutableLong(NULL_LONG); + // + // iterator.forEachRemaining(values -> { + // if (values == null) { + // itemCount.increment(); + // nullCount.increment(); + // return; + // } + // for (final long value : values) { + // itemCount.increment(); + // if (value == NULL_LONG) { + // nullCount.increment(); + // } else { + // if (min.getValue() == NULL_LONG || value < min.getValue()) { + // min.setValue(value); + // } + // if (max.getValue() == NULL_LONG || value > max.getValue()) { + // max.setValue(value); + // } + // } + // } + // }); + // + // assertEquals(nullCount.intValue(), statistics.getNumNulls()); + // if (!itemCount.getValue().equals(nullCount.getValue())) { + // // There are some non-null values, so min and max should be non-null and equal to observed values. + // assertEquals(min.getValue(), statistics.genericGetMin()); + // assertEquals(max.getValue(), statistics.genericGetMax()); + // } else { + // // Everything is null, statistics should be empty. + // assertFalse(statistics.hasNonNullValue()); + // } + // } + // + // private void assertFloatColumnStatistics(SerialFloatColumnIterator iterator, Statistics statistics) { + // MutableLong itemCount = new MutableLong(0); + // MutableLong nullCount = new MutableLong(0); + // MutableFloat min = new MutableFloat(NULL_FLOAT); + // MutableFloat max = new MutableFloat(NULL_FLOAT); + // + // iterator.forEachRemaining((FloatConsumer) value -> { + // itemCount.increment(); + // if (value == NULL_FLOAT) { + // nullCount.increment(); + // } else { + // if (min.getValue() == NULL_FLOAT || value < min.getValue()) { + // min.setValue(value); + // } + // if (max.getValue() == NULL_FLOAT || value > max.getValue()) { + // max.setValue(value); + // } + // } + // }); + // + // assertEquals(nullCount.intValue(), statistics.getNumNulls()); + // if (!itemCount.getValue().equals(nullCount.getValue())) { + // // There are some non-null values, so min and max should be non-null and equal to observed values. + // // Use FloatComparisons.compare() to handle -0.0f == 0.0f properly + // assertEquals(FloatComparisons.compare(min.getValue(), statistics.genericGetMin()), 0); + // assertEquals(FloatComparisons.compare(max.getValue(), statistics.genericGetMax()), 0); + // } else { + // // Everything is null, statistics should be empty. 
+    // assertFalse(statistics.hasNonNullValue());
+    // }
+    // }
+    //
+    // private void assertFloatArrayColumnStatistics(SerialObjectColumnIterator iterator,
+    // Statistics statistics) {
+    // MutableLong itemCount = new MutableLong(0);
+    // MutableLong nullCount = new MutableLong(0);
+    // MutableFloat min = new MutableFloat(NULL_FLOAT);
+    // MutableFloat max = new MutableFloat(NULL_FLOAT);
+    //
+    // iterator.forEachRemaining(values -> {
+    // if (values == null) {
+    // itemCount.increment();
+    // nullCount.increment();
+    // return;
+    // }
+    // for (final float value : values) {
+    // itemCount.increment();
+    // if (value == NULL_FLOAT) {
+    // nullCount.increment();
+    // } else {
+    // if (min.getValue() == NULL_FLOAT || value < min.getValue()) {
+    // min.setValue(value);
+    // }
+    // if (max.getValue() == NULL_FLOAT || value > max.getValue()) {
+    // max.setValue(value);
+    // }
+    // }
+    // }
+    // });
+    //
+    // assertEquals(nullCount.intValue(), statistics.getNumNulls());
+    // if (!itemCount.getValue().equals(nullCount.getValue())) {
+    // // There are some non-null values, so min and max should be non-null and equal to observed values.
+    // // Use FloatComparisons.compare() to handle -0.0f == 0.0f properly
+    // assertEquals(FloatComparisons.compare(min.getValue(), statistics.genericGetMin()), 0);
+    // assertEquals(FloatComparisons.compare(max.getValue(), statistics.genericGetMax()), 0);
+    // } else {
+    // // Everything is null, statistics should be empty.
+    // assertFalse(statistics.hasNonNullValue());
+    // }
+    // }
+    //
+    // private void assertFloatVectorColumnStatistics(SerialObjectColumnIterator iterator,
+    // Statistics statistics) {
+    // MutableLong itemCount = new MutableLong(0);
+    // MutableLong nullCount = new MutableLong(0);
+    // MutableFloat min = new MutableFloat(NULL_FLOAT);
+    // MutableFloat max = new MutableFloat(NULL_FLOAT);
+    //
+    // iterator.forEachRemaining(values -> {
+    // if (values == null) {
+    // itemCount.increment();
+    // nullCount.increment();
+    // return;
+    // }
+    // for (final float value : values) {
+    // itemCount.increment();
+    // if (value == NULL_FLOAT) {
+    // nullCount.increment();
+    // } else {
+    // if (min.getValue() == NULL_FLOAT || value < min.getValue()) {
+    // min.setValue(value);
+    // }
+    // if (max.getValue() == NULL_FLOAT || value > max.getValue()) {
+    // max.setValue(value);
+    // }
+    // }
+    // }
+    // });
+    //
+    // assertEquals(nullCount.intValue(), statistics.getNumNulls());
+    // if (!itemCount.getValue().equals(nullCount.getValue())) {
+    // // There are some non-null values, so min and max should be non-null and equal to observed values.
+    // // Use FloatComparisons.compare() to handle -0.0f == 0.0f properly
+    // assertEquals(FloatComparisons.compare(min.getValue(), statistics.genericGetMin()), 0);
+    // assertEquals(FloatComparisons.compare(max.getValue(), statistics.genericGetMax()), 0);
+    // } else {
+    // // Everything is null, statistics should be empty.
+    // assertFalse(statistics.hasNonNullValue());
+    // }
+    // }
+    //
+    // private void assertDoubleColumnStatistics(SerialDoubleColumnIterator iterator, Statistics statistics) {
+    // MutableLong itemCount = new MutableLong(0);
+    // MutableLong nullCount = new MutableLong(0);
+    // MutableDouble min = new MutableDouble(NULL_DOUBLE);
+    // MutableDouble max = new MutableDouble(NULL_DOUBLE);
+    //
+    // iterator.forEachRemaining((DoubleConsumer) value -> {
+    // itemCount.increment();
+    // if (value == NULL_DOUBLE) {
+    // nullCount.increment();
+    // } else {
+    // if (min.getValue() == NULL_DOUBLE || value < min.getValue()) {
+    // min.setValue(value);
+    // }
+    // if (max.getValue() == NULL_DOUBLE || value > max.getValue()) {
+    // max.setValue(value);
+    // }
+    // }
+    // });
+    //
+    // assertEquals(nullCount.intValue(), statistics.getNumNulls());
+    // if (!itemCount.getValue().equals(nullCount.getValue())) {
+    // // There are some non-null values, so min and max should be non-null and equal to observed values.
+    // // Use DoubleComparisons.compare() to handle -0.0f == 0.0f properly
+    // assertEquals(DoubleComparisons.compare(min.getValue(), statistics.genericGetMin()), 0);
+    // assertEquals(DoubleComparisons.compare(max.getValue(), statistics.genericGetMax()), 0);
+    // } else {
+    // // Everything is null, statistics should be empty.
+    // assertFalse(statistics.hasNonNullValue());
+    // }
+    // }
+    //
+    // private void assertDoubleArrayColumnStatistics(SerialObjectColumnIterator iterator,
+    // Statistics statistics) {
+    // MutableLong itemCount = new MutableLong(0);
+    // MutableLong nullCount = new MutableLong(0);
+    // MutableDouble min = new MutableDouble(NULL_DOUBLE);
+    // MutableDouble max = new MutableDouble(NULL_DOUBLE);
+    //
+    // iterator.forEachRemaining(values -> {
+    // if (values == null) {
+    // itemCount.increment();
+    // nullCount.increment();
+    // return;
+    // }
+    // for (final double value : values) {
+    // itemCount.increment();
+    // if (value == NULL_DOUBLE) {
+    // nullCount.increment();
+    // } else {
+    // if (min.getValue() == NULL_DOUBLE || value < min.getValue()) {
+    // min.setValue(value);
+    // }
+    // if (max.getValue() == NULL_DOUBLE || value > max.getValue()) {
+    // max.setValue(value);
+    // }
+    // }
+    // }
+    // });
+    //
+    // assertEquals(nullCount.intValue(), statistics.getNumNulls());
+    // if (!itemCount.getValue().equals(nullCount.getValue())) {
+    // // There are some non-null values, so min and max should be non-null and equal to observed values.
+    // // Use DoubleComparisons.compare() to handle -0.0f == 0.0f properly
+    // assertEquals(DoubleComparisons.compare(min.getValue(), statistics.genericGetMin()), 0);
+    // assertEquals(DoubleComparisons.compare(max.getValue(), statistics.genericGetMax()), 0);
+    // } else {
+    // // Everything is null, statistics should be empty.
+    // assertFalse(statistics.hasNonNullValue());
+    // }
+    // }
+    //
+    // private void assertDoubleVectorColumnStatistics(SerialObjectColumnIterator iterator,
+    // Statistics statistics) {
+    // MutableLong itemCount = new MutableLong(0);
+    // MutableLong nullCount = new MutableLong(0);
+    // MutableDouble min = new MutableDouble(NULL_DOUBLE);
+    // MutableDouble max = new MutableDouble(NULL_DOUBLE);
+    //
+    // iterator.forEachRemaining(values -> {
+    // if (values == null) {
+    // itemCount.increment();
+    // nullCount.increment();
+    // return;
+    // }
+    // for (final double value : values) {
+    // itemCount.increment();
+    // if (value == NULL_DOUBLE) {
+    // nullCount.increment();
+    // } else {
+    // if (min.getValue() == NULL_DOUBLE || value < min.getValue()) {
+    // min.setValue(value);
+    // }
+    // if (max.getValue() == NULL_DOUBLE || value > max.getValue()) {
+    // max.setValue(value);
+    // }
+    // }
+    // }
+    // });
+    //
+    // assertEquals(nullCount.intValue(), statistics.getNumNulls());
+    // if (!itemCount.getValue().equals(nullCount.getValue())) {
+    // // There are some non-null values, so min and max should be non-null and equal to observed values.
+    // // Use DoubleComparisons.compare() to handle -0.0f == 0.0f properly
+    // assertEquals(DoubleComparisons.compare(min.getValue(), statistics.genericGetMin()), 0);
+    // assertEquals(DoubleComparisons.compare(max.getValue(), statistics.genericGetMax()), 0);
+    // } else {
+    // // Everything is null, statistics should be empty.
+    // assertFalse(statistics.hasNonNullValue());
+    // }
+    // }
+    //
+    // private void assertStringColumnStatistics(SerialObjectColumnIterator iterator,
+    // Statistics statistics) {
+    // MutableLong itemCount = new MutableLong(0);
+    // MutableLong nullCount = new MutableLong(0);
+    // MutableObject min = new MutableObject<>(null);
+    // MutableObject max = new MutableObject<>(null);
+    //
+    // iterator.forEachRemaining((value) -> {
+    // itemCount.increment();
+    // if (value == null) {
+    // nullCount.increment();
+    // } else {
+    // if (min.getValue() == null || value.compareTo(min.getValue()) < 0) {
+    // min.setValue(value);
+    // }
+    // if (max.getValue() == null || value.compareTo(max.getValue()) > 0) {
+    // max.setValue(value);
+    // }
+    // }
+    // });
+    //
+    // assertEquals(nullCount.intValue(), statistics.getNumNulls());
+    // if (!itemCount.getValue().equals(nullCount.getValue())) {
+    // // There are some non-null values, so min and max should be non-null and equal to observed values.
+    // assertEquals(Binary.fromString(min.getValue()), statistics.genericGetMin());
+    // assertEquals(Binary.fromString(max.getValue()), statistics.genericGetMax());
+    // } else {
+    // // Everything is null, statistics should be empty.
+    // assertFalse(statistics.hasNonNullValue());
+    // }
+    // }
+    //
+    // private void assertStringArrayColumnStatistics(SerialObjectColumnIterator iterator,
+    // Statistics statistics) {
+    // MutableLong itemCount = new MutableLong(0);
+    // MutableLong nullCount = new MutableLong(0);
+    // MutableObject min = new MutableObject<>(null);
+    // MutableObject max = new MutableObject<>(null);
+    //
+    // iterator.forEachRemaining(values -> {
+    // if (values == null) {
+    // itemCount.increment();
+    // nullCount.increment();
+    // return;
+    // }
+    // for (final String value : values) {
+    // itemCount.increment();
+    // if (value == null) {
+    // nullCount.increment();
+    // } else {
+    // if (min.getValue() == null || value.compareTo(min.getValue()) < 0) {
+    // min.setValue(value);
+    // }
+    // if (max.getValue() == null || value.compareTo(max.getValue()) > 0) {
+    // max.setValue(value);
+    // }
+    // }
+    // }
+    // });
+    //
+    // assertEquals(nullCount.intValue(), statistics.getNumNulls());
+    // if (!itemCount.getValue().equals(nullCount.getValue())) {
+    // // There are some non-null values, so min and max should be non-null and equal to observed values.
+    // assertEquals(Binary.fromString(min.getValue()), statistics.genericGetMin());
+    // assertEquals(Binary.fromString(max.getValue()), statistics.genericGetMax());
+    // } else {
+    // // Everything is null, statistics should be empty.
+    // assertFalse(statistics.hasNonNullValue());
+    // }
+    // }
+    //
+    // private void assertStringVectorColumnStatistics(SerialObjectColumnIterator> iterator,
+    // Statistics statistics) {
+    // MutableLong itemCount = new MutableLong(0);
+    // MutableLong nullCount = new MutableLong(0);
+    // MutableObject min = new MutableObject<>(null);
+    // MutableObject max = new MutableObject<>(null);
+    //
+    // iterator.forEachRemaining(values -> {
+    // if (values == null) {
+    // itemCount.increment();
+    // nullCount.increment();
+    // return;
+    // }
+    // for (String value : values) {
+    // itemCount.increment();
+    // if (value == null) {
+    // nullCount.increment();
+    // } else {
+    // if (min.getValue() == null || value.compareTo(min.getValue()) < 0) {
+    // min.setValue(value);
+    // }
+    // if (max.getValue() == null || value.compareTo(max.getValue()) > 0) {
+    // max.setValue(value);
+    // }
+    // }
+    // }
+    // });
+    //
+    // assertEquals(nullCount.intValue(), statistics.getNumNulls());
+    // if (!itemCount.getValue().equals(nullCount.getValue())) {
+    // // There are some non-null values, so min and max should be non-null and equal to observed values.
+    // assertEquals(Binary.fromString(min.getValue()), statistics.genericGetMin());
+    // assertEquals(Binary.fromString(max.getValue()), statistics.genericGetMax());
+    // } else {
+    // // Everything is null, statistics should be empty.
+    // assertFalse(statistics.hasNonNullValue());
+    // }
+    // }
+    //
+    // private void assertInstantColumnStatistic(SerialObjectColumnIterator iterator,
+    // Statistics statistics) {
+    // MutableLong itemCount = new MutableLong(0);
+    // MutableLong nullCount = new MutableLong(0);
+    // MutableLong min = new MutableLong(NULL_LONG);
+    // MutableLong max = new MutableLong(NULL_LONG);
+    //
+    // iterator.forEachRemaining((value) -> {
+    // itemCount.increment();
+    // if (value == null) {
+    // nullCount.increment();
+    // } else {
+    // // DateTimeUtils.epochNanos() is the correct conversion for Instant to long.
+    // if (min.getValue() == NULL_LONG || DateTimeUtils.epochNanos(value) < min.getValue()) {
+    // min.setValue(DateTimeUtils.epochNanos(value));
+    // }
+    // if (max.getValue() == NULL_LONG || DateTimeUtils.epochNanos(value) > max.getValue()) {
+    // max.setValue(DateTimeUtils.epochNanos(value));
+    // }
+    // }
+    // });
+    //
+    // assertEquals(nullCount.intValue(), statistics.getNumNulls());
+    // if (!itemCount.getValue().equals(nullCount.getValue())) {
+    // // There are some non-null values, so min and max should be non-null and equal to observed values.
+    // assertEquals(min.getValue(), statistics.genericGetMin());
+    // assertEquals(max.getValue(), statistics.genericGetMax());
+    // } else {
+    // // Everything is null, statistics should be empty.
+    // assertFalse(statistics.hasNonNullValue());
+    // }
+    // }
+    //
+    // private void assertInstantArrayColumnStatistics(SerialObjectColumnIterator iterator,
+    // Statistics statistics) {
+    // MutableLong itemCount = new MutableLong(0);
+    // MutableLong nullCount = new MutableLong(0);
+    // MutableLong min = new MutableLong(NULL_LONG);
+    // MutableLong max = new MutableLong(NULL_LONG);
+    //
+    // iterator.forEachRemaining(values -> {
+    // if (values == null) {
+    // itemCount.increment();
+    // nullCount.increment();
+    // return;
+    // }
+    // for (final Instant value : values) {
+    // itemCount.increment();
+    // if (value == null) {
+    // nullCount.increment();
+    // } else {
+    // // DateTimeUtils.epochNanos() is the correct conversion for Instant to long.
+    // if (min.getValue() == NULL_LONG || DateTimeUtils.epochNanos(value) < min.getValue()) {
+    // min.setValue(DateTimeUtils.epochNanos(value));
+    // }
+    // if (max.getValue() == NULL_LONG || DateTimeUtils.epochNanos(value) > max.getValue()) {
+    // max.setValue(DateTimeUtils.epochNanos(value));
+    // }
+    // }
+    // }
+    // });
+    //
+    // assertEquals(nullCount.intValue(), statistics.getNumNulls());
+    // if (!itemCount.getValue().equals(nullCount.getValue())) {
+    // // There are some non-null values, so min and max should be non-null and equal to observed values.
+    // assertEquals(min.getValue(), statistics.genericGetMin());
+    // assertEquals(max.getValue(), statistics.genericGetMax());
+    // } else {
+    // // Everything is null, statistics should be empty.
+    // assertFalse(statistics.hasNonNullValue());
+    // }
+    // }
+    //
+    // private void assertInstantVectorColumnStatistics(SerialObjectColumnIterator> iterator,
+    // Statistics statistics) {
+    // MutableLong itemCount = new MutableLong(0);
+    // MutableLong nullCount = new MutableLong(0);
+    // MutableLong min = new MutableLong(NULL_LONG);
+    // MutableLong max = new MutableLong(NULL_LONG);
+    //
+    // iterator.forEachRemaining(values -> {
+    // if (values == null) {
+    // itemCount.increment();
+    // nullCount.increment();
+    // return;
+    // }
+    // for (Instant value : values) {
+    // itemCount.increment();
+    // if (value == null) {
+    // nullCount.increment();
+    // } else {
+    // // DateTimeUtils.epochNanos() is the correct conversion for Instant to long.
+    // if (min.getValue() == NULL_LONG || DateTimeUtils.epochNanos(value) < min.getValue()) {
+    // min.setValue(DateTimeUtils.epochNanos(value));
+    // }
+    // if (max.getValue() == NULL_LONG || DateTimeUtils.epochNanos(value) > max.getValue()) {
+    // max.setValue(DateTimeUtils.epochNanos(value));
+    // }
+    // }
+    // }
+    // });
+    //
+    // assertEquals(nullCount.intValue(), statistics.getNumNulls());
+    // if (!itemCount.getValue().equals(nullCount.getValue())) {
+    // // There are some non-null values, so min and max should be non-null and equal to observed
+    // // values.
+    // assertEquals(min.getValue(), statistics.genericGetMin());
+    // assertEquals(max.getValue(), statistics.genericGetMax());
+    // } else {
+    // // Everything is null, statistics should be empty.
+    // assertFalse(statistics.hasNonNullValue());
+    // }
+    // }
+    //
+    // private void assertBigDecimalColumnStatistics(SerialObjectColumnIterator iterator,
+    // Statistics statistics) {
+    // MutableLong itemCount = new MutableLong(0);
+    // MutableLong nullCount = new MutableLong(0);
+    // MutableObject min = new MutableObject<>(null);
+    // MutableObject max = new MutableObject<>(null);
+    //
+    // iterator.forEachRemaining((value) -> {
+    // itemCount.increment();
+    // if (value == null) {
+    // nullCount.increment();
+    // } else {
+    // if (min.getValue() == null || value.compareTo(min.getValue()) < 0) {
+    // min.setValue(value);
+    // }
+    // if (max.getValue() == null || value.compareTo(max.getValue()) > 0) {
+    // max.setValue(value);
+    // }
+    // }
+    // });
+    //
+    // assertEquals(nullCount.intValue(), statistics.getNumNulls());
+    // if (!itemCount.getValue().equals(nullCount.getValue())) {
+    // // There are some non-null values, so min and max should be non-null and equal to observed values.
+    // assertEquals(Binary.fromConstantByteArray(min.getValue().unscaledValue().toByteArray()),
+    // statistics.genericGetMin());
+    // assertEquals(Binary.fromConstantByteArray(max.getValue().unscaledValue().toByteArray()),
+    // statistics.genericGetMax());
+    // } else {
+    // // Everything is null, statistics should be empty.
+    // assertFalse(statistics.hasNonNullValue());
+    // }
+    // }
+    //
+    // private void assertBigIntegerColumnStatistics(SerialObjectColumnIterator iterator,
+    // Statistics statistics) {
+    // MutableLong itemCount = new MutableLong(0);
+    // MutableLong nullCount = new MutableLong(0);
+    // MutableObject min = new MutableObject<>(null);
+    // MutableObject max = new MutableObject<>(null);
+    //
+    // iterator.forEachRemaining((value) -> {
+    // itemCount.increment();
+    // if (value == null) {
+    // nullCount.increment();
+    // } else {
+    // if (min.getValue() == null || value.compareTo(min.getValue()) < 0) {
+    // min.setValue(value);
+    // }
+    // if (max.getValue() == null || value.compareTo(max.getValue()) > 0) {
+    // max.setValue(value);
+    // }
+    // }
+    // });
+    //
+    // assertEquals(nullCount.intValue(), statistics.getNumNulls());
+    // if (!itemCount.getValue().equals(nullCount.getValue())) {
+    // // There are some non-null values, so min and max should be non-null and equal to observed values.
+    // assertEquals(Binary.fromConstantByteArray(min.getValue().toByteArray()), statistics.genericGetMin());
+    // assertEquals(Binary.fromConstantByteArray(max.getValue().toByteArray()), statistics.genericGetMax());
+    // } else {
+    // // Everything is null, statistics should be empty.
+    // assertFalse(statistics.hasNonNullValue());
+    // }
+    // }
     // endregion Column Statistics Assertions

     private static Table checkSingleTable(Table expected, File source) {
@@ -3360,7 +3367,13 @@ private static Table checkSingleTable(Table expected, File source) {
     }

     private static Table checkSingleTable(Table expected, File source, ParquetInstructions instructions) {
-        final Table singleTable = readSingleFileTable(source, instructions);
+        final URI sourceURI;
+        try {
+            sourceURI = new URI(source.toString());
+        } catch (final URISyntaxException e) {
+            throw new UncheckedDeephavenException(e);
+        }
+        final Table singleTable = readSingleFileTable(sourceURI, instructions);
         assertTableEquals(expected, singleTable);
         // Note: we can uncomment out the below lines for extra testing of readTable inference and readSingleTable via
         // definition, but it's ultimately extra work that we've already explicitly tested.